From 277bd6e5379e0c1e1eb64db1a654b30e1efddc8e Mon Sep 17 00:00:00 2001 From: Vijay Thakkar Date: Mon, 23 Jan 2023 17:55:28 -0800 Subject: [PATCH] CUTLASS 3.0.0 (#786) * CUTLASS 3.0.0 --- CHANGELOG.md | 13 + CITATION.cff | 84 +- CMakeLists.txt | 83 +- CONTRIBUTORS.md | 122 +- README.md | 184 +- cuBLAS.cmake | 2 +- examples/10_planar_complex/CMakeLists.txt | 1 + .../11_planar_complex_array/CMakeLists.txt | 1 + .../gemm_with_layernorm.h | 30 +- .../fmha_grouped.h | 12 +- .../kernel_forward.h | 2 +- .../mma_from_smem.h | 4 +- .../ampere_gemm_universal_streamk.cu | 2 +- .../48_hopper_warp_specialized_gemm.cu | 463 + .../CMakeLists.txt | 35 + ..._gemm_schedules_with_collective_builder.cu | 522 + .../CMakeLists.txt | 35 + .../50_hopper_gemm_with_epilogue_swizzle.cu | 529 + .../CMakeLists.txt | 35 + examples/CMakeLists.txt | 6 + examples/cute/CMakeLists.txt | 30 + examples/cute/tutorial/CMakeLists.txt | 34 + examples/cute/tutorial/sgemm_nt_1.cu | 426 + include/cute/algorithm/axpby.hpp | 79 + include/cute/algorithm/clear.hpp | 66 + include/cute/algorithm/copy.hpp | 262 + include/cute/algorithm/fill.hpp | 87 + include/cute/algorithm/functional.hpp | 198 + include/cute/algorithm/gemm.hpp | 718 + include/cute/algorithm/prefer.hpp | 46 + include/cute/algorithm/tensor_algorithms.hpp | 102 + include/cute/algorithm/tuple_algorithms.hpp | 846 ++ include/cute/arch/cluster_sm90.hpp | 190 + include/cute/arch/copy.hpp | 71 + include/cute/arch/copy_sm75.hpp | 215 + include/cute/arch/copy_sm80.hpp | 138 + include/cute/arch/copy_sm90.hpp | 225 + include/cute/arch/copy_sm90_desc.hpp | 194 + include/cute/arch/copy_sm90_tma.hpp | 552 + include/cute/arch/mma.hpp | 64 + include/cute/arch/mma_sm61.hpp | 87 + include/cute/arch/mma_sm70.hpp | 329 + include/cute/arch/mma_sm75.hpp | 120 + include/cute/arch/mma_sm80.hpp | 2132 +++ include/cute/arch/mma_sm90.hpp | 961 ++ include/cute/arch/mma_sm90_desc.hpp | 131 + include/cute/arch/mma_sm90_gmma.hpp | 12265 ++++++++++++++++ include/cute/arch/util.hpp | 178 + include/cute/atom/copy_atom.hpp | 671 + include/cute/atom/copy_traits.hpp | 76 + include/cute/atom/copy_traits_sm75.hpp | 143 + include/cute/atom/copy_traits_sm80.hpp | 98 + include/cute/atom/copy_traits_sm90.hpp | 132 + include/cute/atom/copy_traits_sm90_tma.hpp | 795 + include/cute/atom/mma_atom.hpp | 1081 ++ include/cute/atom/mma_traits.hpp | 70 + include/cute/atom/mma_traits_sm61.hpp | 73 + include/cute/atom/mma_traits_sm70.hpp | 198 + include/cute/atom/mma_traits_sm75.hpp | 81 + include/cute/atom/mma_traits_sm80.hpp | 446 + include/cute/atom/mma_traits_sm90.hpp | 132 + include/cute/atom/mma_traits_sm90_gmma.hpp | 2975 ++++ include/cute/config.hpp | 121 + include/cute/container/alignment.hpp | 70 + include/cute/container/array.hpp | 282 + include/cute/container/array_aligned.hpp | 276 + include/cute/container/array_subbyte.hpp | 613 + include/cute/container/array_view.hpp | 274 + include/cute/container/bit_field.hpp | 131 + include/cute/container/tuple.hpp | 671 + include/cute/container/type_list.hpp | 84 + include/cute/int_tuple.hpp | 827 ++ include/cute/layout.hpp | 1638 +++ include/cute/numeric/arithmetic_tuple.hpp | 388 + include/cute/numeric/bfloat.hpp | 51 + include/cute/numeric/complex.hpp | 163 + include/cute/numeric/float8.hpp | 43 + include/cute/numeric/half.hpp | 41 + include/cute/numeric/int.hpp | 129 + include/cute/numeric/integer_sequence.hpp | 139 + include/cute/numeric/integer_subbyte.hpp | 233 + include/cute/numeric/integral_constant.hpp | 414 + include/cute/numeric/math.hpp | 319 + 
include/cute/numeric/real.hpp | 56 + include/cute/numeric/tfloat.hpp | 51 + include/cute/numeric/uint128.hpp | 259 + include/cute/pointer.hpp | 322 + include/cute/stride.hpp | 411 + include/cute/swizzle.hpp | 497 + include/cute/swizzle_layout.hpp | 1010 ++ include/cute/swizzle_ptr.hpp | 282 + include/cute/tensor.hpp | 900 ++ include/cute/tensor_predicate.hpp | 63 + include/cute/tile.hpp | 58 + include/cute/underscore.hpp | 148 + include/cute/util/debug.hpp | 153 + include/cute/util/print.hpp | 140 + include/cute/util/type_traits.hpp | 101 + include/cutlass/arch/barrier.h | 404 + include/cutlass/arch/memory_sm75.h | 66 +- include/cutlass/arch/mma.h | 1 - include/cutlass/arch/mma_sm80.h | 2 +- include/cutlass/arch/mma_sm90.h | 147 +- include/cutlass/arch/reg_reconfig.h | 68 + include/cutlass/array_subbyte.h | 4 - include/cutlass/cluster_launch.hpp | 156 + .../conv/kernel/implicit_gemm_convolution.h | 2 +- .../kernel/implicit_gemm_convolution_fusion.h | 2 +- .../implicit_gemm_convolution_strided_dgrad.h | 2 +- ...cit_gemm_convolution_with_fused_epilogue.h | 2 +- include/cutlass/cutlass.h | 57 +- include/cutlass/device_kernel.h | 38 +- .../collective/collective_epilogue.hpp | 49 + .../epilogue/collective/default_epilogue.hpp | 195 + .../default_transposed_epilogue.hpp | 203 + .../cutlass/epilogue/collective/epilogue.hpp | 322 + include/cutlass/epilogue/dispatch_policy.hpp | 39 + .../epilogue/thread/linear_combination.h | 87 +- include/cutlass/functional.h | 30 + .../collective/builders/sm90_gmma_builder.inl | 414 + .../gemm/collective/collective_builder.hpp | 78 + .../gemm/collective/collective_mma.hpp | 71 + .../gemm/collective/sm70_mma_twostage.hpp | 588 + .../gemm/collective/sm80_mma_multistage.hpp | 680 + .../sm90_mma_multistage_gmma_ss.hpp | 596 + .../gemm/collective/sm90_mma_tma_gmma_ss.hpp | 480 + .../sm90_mma_tma_gmma_ss_warpspecialized.hpp | 494 + .../gemm/device/gemm_universal_adapter.h | 367 +- include/cutlass/gemm/dispatch_policy.hpp | 144 + include/cutlass/gemm/gemm.h | 148 + include/cutlass/gemm/kernel/default_gemm.h | 12 +- include/cutlass/gemm/kernel/gemm.h | 2 +- include/cutlass/gemm/kernel/gemm_array.h | 2 +- include/cutlass/gemm/kernel/gemm_batched.h | 2 +- include/cutlass/gemm/kernel/gemm_grouped.h | 2 +- include/cutlass/gemm/kernel/gemm_pipelined.h | 2 +- .../cutlass/gemm/kernel/gemm_planar_complex.h | 2 +- .../gemm/kernel/gemm_planar_complex_array.h | 2 +- include/cutlass/gemm/kernel/gemm_universal.h | 14 +- .../cutlass/gemm/kernel/gemm_universal.hpp | 72 + .../gemm/kernel/gemm_with_fused_epilogue.h | 6 +- .../gemm/kernel/gemm_with_k_reduction.h | 2 +- .../gemm/kernel/params_universal_base.h | 6 +- include/cutlass/gemm/kernel/rank_2k_grouped.h | 2 +- .../cutlass/gemm/kernel/rank_2k_universal.h | 2 +- .../cutlass/gemm/kernel/rank_k_universal.h | 2 +- include/cutlass/gemm/kernel/sm70_gemm.hpp | 252 + include/cutlass/gemm/kernel/sm90_gemm_tma.hpp | 301 + .../kernel/sm90_gemm_tma_warpspecialized.hpp | 351 + ...90_gemm_tma_warpspecialized_persistent.hpp | 487 + .../gemm/kernel/sm90_tile_scheduler.hpp | 133 + include/cutlass/gemm/kernel/sparse_gemm.h | 2 +- include/cutlass/gemm/kernel/symm_universal.h | 2 +- include/cutlass/gemm/kernel/trmm_universal.h | 2 +- .../gemm/warp/mma_with_reduction_tensor_op.h | 32 +- include/cutlass/kernel_hardware_info.hpp | 71 + include/cutlass/layout/matrix.h | 20 + include/cutlass/pipeline.hpp | 529 + include/cutlass/quaternion.h | 1 - .../transform/pitch_linear_thread_map.h | 38 +- include/cutlass/uint128.h | 6 +- media/docs/code_organization.md 
| 27 +- media/docs/cute/00_quickstart.md | 75 + media/docs/cute/01_layout.md | 254 + media/docs/cute/02_layout_operations.md | 710 + media/docs/cute/03_tensor.md | 262 + media/docs/cute/04_algorithms.md | 223 + media/docs/cute/0t_mma_atom.md | 434 + media/docs/cute/0x_gemm_tutorial.md | 668 + media/docs/cute/0y_predication.md | 217 + .../cutlass_3x_backwards_compatibility.md | 473 + media/docs/cutlass_3x_design.md | 117 + media/docs/doxygen_mainpage.md | 120 +- media/docs/efficient_gemm.md | 15 + media/docs/functionality.md | 110 +- media/docs/gemm_api_3x.md | 701 + media/docs/layout.md | 5 + media/docs/pipeline.md | 210 + media/docs/profiler.md | 54 +- media/docs/programming_guidelines.md | 749 +- media/docs/quickstart.md | 174 +- media/docs/terminology.md | 28 +- media/docs/tile_iterator_concept.md | 13 +- media/docs/utilities.md | 7 + media/images/cute/HMMA.8x8x4.NT.png | Bin 0 -> 547992 bytes media/images/cute/HMMA.8x8x4.quadpair.AB.png | Bin 0 -> 609515 bytes media/images/cute/HMMA.8x8x4.quadpair.C.png | Bin 0 -> 522137 bytes media/images/cute/gmma_coremat_cd_fp16.png | Bin 0 -> 137035 bytes media/images/cute/gmma_wg_n_slice.png | Bin 0 -> 2007411 bytes .../logical_divide-and-zipped_divide-2.png | Bin 0 -> 254099 bytes .../cute/logical_divide-and-zipped_divide.png | Bin 0 -> 250709 bytes .../cutlass-3.0-gemm-peak-performance.png | Bin 0 -> 285327 bytes .../cutlass-reduction-in-named-iterators.png | Bin 0 -> 333316 bytes test/unit/CMakeLists.txt | 15 + test/unit/common/cutlass_unit_test.h | 24 +- test/unit/common/filter_architecture.cpp | 43 +- .../conv/device/conv2d_testbed_interleaved.h | 36 + ...v_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu | 3 + test/unit/core/numeric_conversion.cu | 51 +- test/unit/cute/CMakeLists.txt | 50 + test/unit/cute/ampere/CMakeLists.txt | 33 + test/unit/cute/ampere/cp_async.cu | 104 + test/unit/cute/ampere/ldsm.cu | 431 + test/unit/cute/core/CMakeLists.txt | 44 + test/unit/cute/core/bitfield.cpp | 84 + test/unit/cute/core/coalesce.cpp | 182 + test/unit/cute/core/compare.cpp | 168 + test/unit/cute/core/complement.cpp | 273 + test/unit/cute/core/composition.cpp | 528 + test/unit/cute/core/inverse_left.cpp | 183 + test/unit/cute/core/inverse_right.cpp | 255 + test/unit/cute/core/logical_divide.cpp | 253 + test/unit/cute/core/logical_product.cpp | 218 + test/unit/cute/core/mixedbits.cpp | 70 + test/unit/cute/core/transform.cpp | 49 + test/unit/cute/core/tuple.cpp | 266 + test/unit/cute/hopper/CMakeLists.txt | 58 + test/unit/cute/hopper/stsm.cu | 426 + test/unit/cute/hopper/tma_load.cu | 495 + test/unit/cute/hopper/tma_store.cu | 384 + test/unit/cute/layout/CMakeLists.txt | 32 + test/unit/cute/layout/layout_operator.cu | 136 + test/unit/gemm/device/CMakeLists.txt | 88 +- .../device/default_gemm_configuration.hpp | 1343 ++ ...cf64t_cf64t_tensor_op_f64_gaussian_sm90.cu | 4 +- ...mm_cf64n_cf64t_cf64t_tensor_op_f64_sm90.cu | 4 +- ...cf64n_cf64t_tensor_op_f64_gaussian_sm90.cu | 4 +- ...mm_cf64t_cf64n_cf64t_tensor_op_f64_sm90.cu | 4 +- .../gemm_f64n_f64t_f64t_tensor_op_f64_sm90.cu | 4 +- .../gemm_f64t_f64n_f64t_tensor_op_f64_sm90.cu | 4 +- test/unit/gemm/device/gemm_testbed_3x.hpp | 717 + .../hemm_cf64_cf64_cf64_tensor_op_f64_sm90.cu | 4 +- .../her2k_cf64_cf64_tensor_op_f64_sm90.cu | 4 +- .../herk_cf64_cf64_tensor_op_f64_sm90.cu | 4 +- test/unit/gemm/device/multistage_testbed.h | 5 + .../device/multistage_testbed_interleaved.h | 40 + .../gemm/device/sm50_gemm_f32_f32_f32_simt.cu | 135 + .../gemm/device/sm50_gemm_f64_f64_f64_simt.cu | 134 + 
.../gemm/device/sm61_gemm_s8_s8_s32_simt.cu | 136 + .../sm80_gemm_f16_f16_f32_tensor_op_f32.cu | 136 + .../gemm/device/sm80_gemm_f32_f32_f32_simt.cu | 135 + .../gemm/device/sm80_gemm_f64_f64_f64_simt.cu | 134 + .../sm80_gemm_f64_f64_f64_tensor_op_f64.cu | 98 + .../device/sm80_gemm_s8_s8_s32_tensor_op.cu | 94 + .../sm80_gemm_tf32_tf32_f32_tensor_op_f32.cu | 135 + ...emm_bf16_bf16_bf16_alignx_tensor_op_f32.cu | 188 + .../sm90_gemm_bf16_bf16_bf16_tensor_op_f32.cu | 187 + .../sm90_gemm_f16_f16_f16_alignx_tensor_op.cu | 449 + .../device/sm90_gemm_f16_f16_f16_tensor_op.cu | 1077 ++ ...f16_tensor_op_f32_cluster_unspecialized.cu | 582 + ...6_tensor_op_f32_cluster_warpspecialized.cu | 582 + ..._f32_cluster_warpspecialized_persistent.cu | 1018 ++ .../sm90_gemm_f32_f32_f32_tensor_op_f32.cu | 86 + ...sm90_gemm_s8_s8_s8_alignx_tensor_op_s32.cu | 152 + .../sm90_gemm_s8_s8_s8_tensor_op_s32.cu | 243 + ...gemm_tf32_tf32_f32_alignx_tensor_op_f32.cu | 151 + .../sm90_gemm_tf32_tf32_f32_tensor_op_f32.cu | 185 + .../symm_cf64_cf64_cf64_tensor_op_f64_sm90.cu | 4 +- .../device/symm_f64_f64_tensor_op_f64_sm90.cu | 4 +- .../syr2k_cf64_cf64_tensor_op_f64_sm90.cu | 4 +- .../syr2k_f64_f64_tensor_op_f64_sm90.cu | 4 +- .../syrk_cf64_cf64_tensor_op_f64_sm90.cu | 4 +- .../device/syrk_f64_f64_tensor_op_f64_sm90.cu | 4 +- test/unit/gemm/device/testbed.h | 3 + test/unit/gemm/device/testbed_complex.h | 5 +- .../gemm/device/testbed_gemm_with_broadcast.h | 2 + .../gemm/device/testbed_gemm_with_reduction.h | 4 + test/unit/gemm/device/testbed_interleaved.h | 3 + .../gemm/device/testbed_rank2k_universal.h | 4 +- .../gemm/device/testbed_rank_k_universal.h | 2 + test/unit/gemm/device/testbed_sparse.h | 3 + .../unit/gemm/device/testbed_symm_universal.h | 3 + .../unit/gemm/device/testbed_trmm_universal.h | 3 + test/unit/gemm/device/testbed_universal.h | 3 + .../trmm_cf64_cf64_cf64_tensor_op_f64_sm90.cu | 4 +- .../trmm_f64_f64_f64_tensor_op_f64_sm90.cu | 4 +- .../mma_multistage_sparse_testbed.h | 14 +- .../gemm/threadblock/mma_multistage_testbed.h | 36 +- .../gemm/threadblock/mma_pipelined_testbed.h | 18 +- test/unit/gemm/warp/gemm_complex_sm90.cu | 4 +- test/unit/gemm/warp/gemm_sm90.cu | 4 +- test/unit/gemm/warp/testbed.h | 180 + test/unit/pipeline/CMakeLists.txt | 36 + test/unit/pipeline/pipeline_async.cu | 468 + test/unit/pipeline/pipeline_tma_async.cu | 469 + .../pipeline_tma_async_warp_specialized.cu | 525 + ...e_tma_async_warp_specialized_persistent.cu | 585 + test/unit/pipeline/sequence_barrier.cu | 226 + test/unit/pipeline/testbed.h | 145 + test/unit/util/CMakeLists.txt | 4 - .../library/include/cutlass/library/library.h | 18 +- tools/library/scripts/__init__.py | 0 tools/library/scripts/gemm_operation.py | 216 +- tools/library/scripts/generator.py | 234 +- tools/library/scripts/library.py | 68 +- tools/library/scripts/manifest.py | 5 + tools/library/scripts/pycutlass/README.md | 17 +- tools/library/scripts/pycutlass/build.sh | 36 +- tools/library/scripts/pycutlass/build_doc.sh | 32 + .../docker/Dockerfile-cuda11.8-pytorch | 40 + .../pycutlass/docker/Dockerfile-cuda12.0 | 46 + tools/library/scripts/pycutlass/setup.py | 42 +- .../scripts/pycutlass/src/cpp/cute.cpp | 54 + .../scripts/pycutlass/src/cpp/cutlass.cpp | 3 +- .../epilogue/epilogue_visitor_generic.h | 1 + .../epilogue/epilogue_visitor_op/binary_ops.h | 6 +- .../epilogue/epilogue_visitor_op/unary_ops.h | 4 +- .../visitor_op_accumulator.h | 4 +- .../epilogue_visitor_op/visitor_op_binary.h | 5 +- .../visitor_op_column_broadcast.h | 4 +- .../visitor_op_column_reduction.h | 
5 +- .../visitor_op_linear_combination.h | 6 +- .../visitor_op_row_broadcast.h | 4 +- .../visitor_op_row_reduction.h | 5 +- .../epilogue_visitor_op/visitor_op_unary.h | 6 +- .../gemm/gemm_universal_with_visitor.h | 70 +- .../pycutlass/src/cpp/include/swizzling.h | 1 - .../pycutlass/src/pycutlass/__init__.py | 7 + .../builder/collective_op_builder.py | 395 + .../pycutlass/src/pycutlass/c_types.py | 82 +- .../pycutlass/src/pycutlass/compiler.py | 106 +- .../src/pycutlass/conv2d_operation.py | 7 +- .../pycutlass/src/pycutlass/epilogue.py | 1 + .../pycutlass/src/pycutlass/gemm_operation.py | 402 +- .../pycutlass/src/pycutlass/library.py | 204 +- .../pycutlass/src/pycutlass/operation.py | 55 +- .../scripts/pycutlass/src/pycutlass/parser.py | 5 - .../src/pycutlass/test/conv2d_testbed.py | 20 +- .../pycutlass/test/gemm_grouped_testbed.py | 2 +- .../src/pycutlass/test/gemm_testbed.py | 153 +- .../pycutlass/src/pycutlass/test/utils.py | 109 + .../src/pycutlass/utils/datatypes.py | 121 + ...nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py | 32 + ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py | 32 + ...m_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py | 32 + ...hwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py | 32 + ...nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py | 32 + ...nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py | 32 + ...nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py | 32 + ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py | 32 + ...m_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py | 32 + ...hwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py | 32 + ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py | 32 + ...nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py | 32 + ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py | 32 + ...m_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py | 32 + ...hwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py | 32 + .../pycutlass/test/conv/run_all_tests.py | 32 + .../pycutlass/test/example/run_all_example.sh | 32 + .../pycutlass/test/frontend/run_test.sh | 32 + .../pycutlass/test/frontend/test_frontend.py | 23 +- .../pycutlass/test/gemm/gemm_bf16_sm80.py | 34 +- .../pycutlass/test/gemm/gemm_bf16_sm90.py | 138 + .../pycutlass/test/gemm/gemm_f16_sm80.py | 34 +- .../pycutlass/test/gemm/gemm_f16_sm90.py | 182 + .../pycutlass/test/gemm/gemm_f32_sm80.py | 32 + .../pycutlass/test/gemm/gemm_f64_sm80.py | 34 +- .../pycutlass/test/gemm/gemm_f64_sm90.py | 124 + .../pycutlass/test/gemm/gemm_grouped_sm80.py | 34 +- .../pycutlass/test/gemm/gemm_s8_sm80.py | 34 +- .../pycutlass/test/gemm/gemm_s8_sm90.py | 154 + .../pycutlass/test/gemm/run_all_tests.py | 34 +- tools/library/src/gemm_operation_3x.hpp | 292 + tools/library/src/handle.cu | 6 + tools/profiler/CMakeLists.txt | 1 + tools/profiler/src/gemm_operation_profiler.cu | 7 + tools/profiler/src/operation_profiler.cu | 24 + tools/util/include/cutlass/util/GPU_Clock.hpp | 67 + .../include/cutlass/util/cublas_wrappers.hpp | 526 + .../include/cutlass/util/device_layernorm.h | 2 +- .../util/include/cutlass/util/helper_cuda.hpp | 116 + .../include/cutlass/util/packed_stride.hpp | 101 + .../util/include/cutlass/util/print_error.hpp | 235 + .../cutlass/util/reference/host/gett.hpp | 311 + .../util/reference/host/tensor_compare.hpp | 101 + .../util/reference/host/tensor_fill.hpp | 432 + .../util/reference/host/tensor_reduce.hpp | 203 + 377 files changed, 76395 insertions(+), 1185 deletions(-) create mode 100644 examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu create mode 100644 examples/48_hopper_warp_specialized_gemm/CMakeLists.txt create mode 100644 
examples/49_hopper_gemm_schedules_with_collective_builder/49_hopper_gemm_schedules_with_collective_builder.cu create mode 100644 examples/49_hopper_gemm_schedules_with_collective_builder/CMakeLists.txt create mode 100644 examples/50_hopper_gemm_with_epilogue_swizzle/50_hopper_gemm_with_epilogue_swizzle.cu create mode 100644 examples/50_hopper_gemm_with_epilogue_swizzle/CMakeLists.txt create mode 100644 examples/cute/CMakeLists.txt create mode 100644 examples/cute/tutorial/CMakeLists.txt create mode 100644 examples/cute/tutorial/sgemm_nt_1.cu create mode 100644 include/cute/algorithm/axpby.hpp create mode 100644 include/cute/algorithm/clear.hpp create mode 100644 include/cute/algorithm/copy.hpp create mode 100644 include/cute/algorithm/fill.hpp create mode 100644 include/cute/algorithm/functional.hpp create mode 100644 include/cute/algorithm/gemm.hpp create mode 100644 include/cute/algorithm/prefer.hpp create mode 100644 include/cute/algorithm/tensor_algorithms.hpp create mode 100644 include/cute/algorithm/tuple_algorithms.hpp create mode 100644 include/cute/arch/cluster_sm90.hpp create mode 100644 include/cute/arch/copy.hpp create mode 100644 include/cute/arch/copy_sm75.hpp create mode 100644 include/cute/arch/copy_sm80.hpp create mode 100644 include/cute/arch/copy_sm90.hpp create mode 100644 include/cute/arch/copy_sm90_desc.hpp create mode 100644 include/cute/arch/copy_sm90_tma.hpp create mode 100644 include/cute/arch/mma.hpp create mode 100644 include/cute/arch/mma_sm61.hpp create mode 100644 include/cute/arch/mma_sm70.hpp create mode 100644 include/cute/arch/mma_sm75.hpp create mode 100644 include/cute/arch/mma_sm80.hpp create mode 100644 include/cute/arch/mma_sm90.hpp create mode 100644 include/cute/arch/mma_sm90_desc.hpp create mode 100644 include/cute/arch/mma_sm90_gmma.hpp create mode 100644 include/cute/arch/util.hpp create mode 100644 include/cute/atom/copy_atom.hpp create mode 100644 include/cute/atom/copy_traits.hpp create mode 100644 include/cute/atom/copy_traits_sm75.hpp create mode 100644 include/cute/atom/copy_traits_sm80.hpp create mode 100644 include/cute/atom/copy_traits_sm90.hpp create mode 100644 include/cute/atom/copy_traits_sm90_tma.hpp create mode 100644 include/cute/atom/mma_atom.hpp create mode 100644 include/cute/atom/mma_traits.hpp create mode 100644 include/cute/atom/mma_traits_sm61.hpp create mode 100644 include/cute/atom/mma_traits_sm70.hpp create mode 100644 include/cute/atom/mma_traits_sm75.hpp create mode 100644 include/cute/atom/mma_traits_sm80.hpp create mode 100644 include/cute/atom/mma_traits_sm90.hpp create mode 100644 include/cute/atom/mma_traits_sm90_gmma.hpp create mode 100644 include/cute/config.hpp create mode 100644 include/cute/container/alignment.hpp create mode 100644 include/cute/container/array.hpp create mode 100644 include/cute/container/array_aligned.hpp create mode 100644 include/cute/container/array_subbyte.hpp create mode 100644 include/cute/container/array_view.hpp create mode 100644 include/cute/container/bit_field.hpp create mode 100644 include/cute/container/tuple.hpp create mode 100644 include/cute/container/type_list.hpp create mode 100644 include/cute/int_tuple.hpp create mode 100644 include/cute/layout.hpp create mode 100644 include/cute/numeric/arithmetic_tuple.hpp create mode 100644 include/cute/numeric/bfloat.hpp create mode 100644 include/cute/numeric/complex.hpp create mode 100644 include/cute/numeric/float8.hpp create mode 100644 include/cute/numeric/half.hpp create mode 100644 include/cute/numeric/int.hpp create mode 
100644 include/cute/numeric/integer_sequence.hpp create mode 100644 include/cute/numeric/integer_subbyte.hpp create mode 100644 include/cute/numeric/integral_constant.hpp create mode 100644 include/cute/numeric/math.hpp create mode 100644 include/cute/numeric/real.hpp create mode 100644 include/cute/numeric/tfloat.hpp create mode 100644 include/cute/numeric/uint128.hpp create mode 100644 include/cute/pointer.hpp create mode 100644 include/cute/stride.hpp create mode 100644 include/cute/swizzle.hpp create mode 100644 include/cute/swizzle_layout.hpp create mode 100644 include/cute/swizzle_ptr.hpp create mode 100644 include/cute/tensor.hpp create mode 100644 include/cute/tensor_predicate.hpp create mode 100644 include/cute/tile.hpp create mode 100644 include/cute/underscore.hpp create mode 100644 include/cute/util/debug.hpp create mode 100644 include/cute/util/print.hpp create mode 100644 include/cute/util/type_traits.hpp create mode 100644 include/cutlass/arch/barrier.h create mode 100644 include/cutlass/arch/reg_reconfig.h create mode 100644 include/cutlass/cluster_launch.hpp create mode 100644 include/cutlass/epilogue/collective/collective_epilogue.hpp create mode 100644 include/cutlass/epilogue/collective/default_epilogue.hpp create mode 100644 include/cutlass/epilogue/collective/default_transposed_epilogue.hpp create mode 100644 include/cutlass/epilogue/collective/epilogue.hpp create mode 100644 include/cutlass/epilogue/dispatch_policy.hpp create mode 100644 include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl create mode 100644 include/cutlass/gemm/collective/collective_builder.hpp create mode 100644 include/cutlass/gemm/collective/collective_mma.hpp create mode 100644 include/cutlass/gemm/collective/sm70_mma_twostage.hpp create mode 100644 include/cutlass/gemm/collective/sm80_mma_multistage.hpp create mode 100644 include/cutlass/gemm/collective/sm90_mma_multistage_gmma_ss.hpp create mode 100644 include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss.hpp create mode 100644 include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp create mode 100644 include/cutlass/gemm/dispatch_policy.hpp create mode 100644 include/cutlass/gemm/kernel/gemm_universal.hpp create mode 100644 include/cutlass/gemm/kernel/sm70_gemm.hpp create mode 100644 include/cutlass/gemm/kernel/sm90_gemm_tma.hpp create mode 100644 include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp create mode 100644 include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_persistent.hpp create mode 100644 include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp create mode 100644 include/cutlass/kernel_hardware_info.hpp create mode 100644 include/cutlass/pipeline.hpp create mode 100644 media/docs/cute/00_quickstart.md create mode 100644 media/docs/cute/01_layout.md create mode 100644 media/docs/cute/02_layout_operations.md create mode 100644 media/docs/cute/03_tensor.md create mode 100644 media/docs/cute/04_algorithms.md create mode 100644 media/docs/cute/0t_mma_atom.md create mode 100644 media/docs/cute/0x_gemm_tutorial.md create mode 100644 media/docs/cute/0y_predication.md create mode 100644 media/docs/cutlass_3x_backwards_compatibility.md create mode 100644 media/docs/cutlass_3x_design.md create mode 100644 media/docs/gemm_api_3x.md create mode 100644 media/docs/pipeline.md create mode 100644 media/images/cute/HMMA.8x8x4.NT.png create mode 100644 media/images/cute/HMMA.8x8x4.quadpair.AB.png create mode 100644 media/images/cute/HMMA.8x8x4.quadpair.C.png create mode 100644 
media/images/cute/gmma_coremat_cd_fp16.png create mode 100644 media/images/cute/gmma_wg_n_slice.png create mode 100755 media/images/cute/logical_divide-and-zipped_divide-2.png create mode 100755 media/images/cute/logical_divide-and-zipped_divide.png create mode 100644 media/images/cutlass-3.0-gemm-peak-performance.png create mode 100644 media/images/cutlass-reduction-in-named-iterators.png create mode 100644 test/unit/cute/CMakeLists.txt create mode 100644 test/unit/cute/ampere/CMakeLists.txt create mode 100644 test/unit/cute/ampere/cp_async.cu create mode 100644 test/unit/cute/ampere/ldsm.cu create mode 100644 test/unit/cute/core/CMakeLists.txt create mode 100644 test/unit/cute/core/bitfield.cpp create mode 100644 test/unit/cute/core/coalesce.cpp create mode 100644 test/unit/cute/core/compare.cpp create mode 100644 test/unit/cute/core/complement.cpp create mode 100644 test/unit/cute/core/composition.cpp create mode 100644 test/unit/cute/core/inverse_left.cpp create mode 100644 test/unit/cute/core/inverse_right.cpp create mode 100644 test/unit/cute/core/logical_divide.cpp create mode 100644 test/unit/cute/core/logical_product.cpp create mode 100644 test/unit/cute/core/mixedbits.cpp create mode 100644 test/unit/cute/core/transform.cpp create mode 100644 test/unit/cute/core/tuple.cpp create mode 100644 test/unit/cute/hopper/CMakeLists.txt create mode 100644 test/unit/cute/hopper/stsm.cu create mode 100644 test/unit/cute/hopper/tma_load.cu create mode 100644 test/unit/cute/hopper/tma_store.cu create mode 100644 test/unit/cute/layout/CMakeLists.txt create mode 100644 test/unit/cute/layout/layout_operator.cu create mode 100644 test/unit/gemm/device/default_gemm_configuration.hpp create mode 100644 test/unit/gemm/device/gemm_testbed_3x.hpp create mode 100644 test/unit/gemm/device/sm50_gemm_f32_f32_f32_simt.cu create mode 100644 test/unit/gemm/device/sm50_gemm_f64_f64_f64_simt.cu create mode 100644 test/unit/gemm/device/sm61_gemm_s8_s8_s32_simt.cu create mode 100644 test/unit/gemm/device/sm80_gemm_f16_f16_f32_tensor_op_f32.cu create mode 100644 test/unit/gemm/device/sm80_gemm_f32_f32_f32_simt.cu create mode 100644 test/unit/gemm/device/sm80_gemm_f64_f64_f64_simt.cu create mode 100644 test/unit/gemm/device/sm80_gemm_f64_f64_f64_tensor_op_f64.cu create mode 100644 test/unit/gemm/device/sm80_gemm_s8_s8_s32_tensor_op.cu create mode 100644 test/unit/gemm/device/sm80_gemm_tf32_tf32_f32_tensor_op_f32.cu create mode 100644 test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_alignx_tensor_op_f32.cu create mode 100644 test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_tensor_op_f32.cu create mode 100644 test/unit/gemm/device/sm90_gemm_f16_f16_f16_alignx_tensor_op.cu create mode 100644 test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op.cu create mode 100644 test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_unspecialized.cu create mode 100644 test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized.cu create mode 100644 test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_persistent.cu create mode 100644 test/unit/gemm/device/sm90_gemm_f32_f32_f32_tensor_op_f32.cu create mode 100644 test/unit/gemm/device/sm90_gemm_s8_s8_s8_alignx_tensor_op_s32.cu create mode 100644 test/unit/gemm/device/sm90_gemm_s8_s8_s8_tensor_op_s32.cu create mode 100644 test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_alignx_tensor_op_f32.cu create mode 100644 test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_tensor_op_f32.cu create mode 100644 test/unit/pipeline/CMakeLists.txt 
create mode 100644 test/unit/pipeline/pipeline_async.cu create mode 100644 test/unit/pipeline/pipeline_tma_async.cu create mode 100644 test/unit/pipeline/pipeline_tma_async_warp_specialized.cu create mode 100644 test/unit/pipeline/pipeline_tma_async_warp_specialized_persistent.cu create mode 100644 test/unit/pipeline/sequence_barrier.cu create mode 100644 test/unit/pipeline/testbed.h create mode 100644 tools/library/scripts/__init__.py create mode 100644 tools/library/scripts/pycutlass/docker/Dockerfile-cuda11.8-pytorch create mode 100644 tools/library/scripts/pycutlass/docker/Dockerfile-cuda12.0 create mode 100644 tools/library/scripts/pycutlass/src/cpp/cute.cpp create mode 100644 tools/library/scripts/pycutlass/src/pycutlass/builder/collective_op_builder.py create mode 100644 tools/library/scripts/pycutlass/src/pycutlass/test/utils.py create mode 100644 tools/library/scripts/pycutlass/src/pycutlass/utils/datatypes.py create mode 100644 tools/library/scripts/pycutlass/test/gemm/gemm_bf16_sm90.py create mode 100644 tools/library/scripts/pycutlass/test/gemm/gemm_f16_sm90.py create mode 100644 tools/library/scripts/pycutlass/test/gemm/gemm_f64_sm90.py create mode 100644 tools/library/scripts/pycutlass/test/gemm/gemm_s8_sm90.py create mode 100644 tools/library/src/gemm_operation_3x.hpp create mode 100644 tools/util/include/cutlass/util/GPU_Clock.hpp create mode 100644 tools/util/include/cutlass/util/cublas_wrappers.hpp create mode 100644 tools/util/include/cutlass/util/helper_cuda.hpp create mode 100644 tools/util/include/cutlass/util/packed_stride.hpp create mode 100644 tools/util/include/cutlass/util/print_error.hpp create mode 100644 tools/util/include/cutlass/util/reference/host/gett.hpp create mode 100644 tools/util/include/cutlass/util/reference/host/tensor_compare.hpp create mode 100644 tools/util/include/cutlass/util/reference/host/tensor_fill.hpp create mode 100644 tools/util/include/cutlass/util/reference/host/tensor_reduce.hpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c67bdd333..8744e0a6c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ # NVIDIA CUTLASS Changelog + +## [3.0.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.0.0) (2023-01-23) +* [CuTe](/media/docs/cute/00_quickstart.md), a [new core library and backend](/include/cute) for CUTLASS 3.0 that defines a single Layout vocabulary type and an associated algebra of layouts for a much more expressive and composable abstraction for tensors, sets of parallel agents, and operations by said agents on tensors. +* [A new conceptual operation hierarchy](media/docs/cutlass_3x_design.md) that replaces the architecture-centric hierarchy of CUTLASS 2.x and [documentation for CUTLASS 3.0's GEMM API changes](/media/docs/gemm_api_3x.md). +* Strict API backwards compatibility that exposes both 2.x and 3.x API kernels through the same [`device::GemmUniversalAdapter`](include/cutlass/gemm/device/gemm_universal_adapter.h) and [`kernel::GemmUniversal`](include/cutlass/gemm/kernel/gemm_universal.hpp) types, allowing users to include both APIs in the same translation units. More information can be found in the [3.x backwards compatibility section](media/docs/cutlass_3x_backwards_compatibility.md). +* Updates to [Functionality](media/docs/functionality.md) which directs users on which kernels are supported via CUTLASS-2 and CUTLASS-3. 
+* Updates to the [Compatibility](/README.md#compatibility) section regarding supported compilers, operating systems, CUDA Toolkits, hardware architectures, and [Target Architecture](/README.md#Target-Architecture).
+* New warp-specialized GEMM [kernel schedules](include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp) and [mainloops](include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp) targeting the Hopper architecture that achieve great performance with TMA, WGMMA, and threadblock clusters.
+* Extensions to the CUTLASS profiler to support threadblock cluster shapes in library and profiler tile configurations.
+* [CUTLASS library integration](/tools/library/src/gemm_operation_3x.hpp) for 3.x API kernels built through the new `CollectiveBuilder` API, enabling the CUTLASS profiler to run them.
+* Support for [Hopper GEMMs](examples/48_hopper_warp_specialized_gemm) through the new 3.0 API with CuTe-based exposure of the Hopper [Tensor Memory Accelerator](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor) and [WGMMA Tensor Core](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-instructions) features.
+* A set of examples that demonstrates how to use the new 3.0 API to easily build GEMM kernels targeting Hopper: examples [48](examples/48_hopper_warp_specialized_gemm), [49](examples/49_hopper_gemm_schedules_with_collective_builder), and [50](examples/50_hopper_gemm_with_epilogue_swizzle).
+
## [2.11.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.11.0) (2022-11-19)
* [Stream-K](/examples/47_ampere_gemm_universal_streamk), which is a new general way to do split-K. It can not only improve performance, but can also significantly reduce the number of tile sizes that need to be profiled to find the best one.
* [Fused multi-head attention kernel](/examples/41_fused_multi_head_attention). It has two variants: one uses batched GEMM for the fixed sequence length, and the other one uses group GEMM for the variable sequence length. Both versions just need one kernel.
diff --git a/CITATION.cff b/CITATION.cff
index 7ae2b4b1ce..ea97f1f68e 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -5,33 +5,61 @@ message: >-
  following metadata.
type: software authors: - - given-names: Andrew - email: akerr@nvidia.com - family-names: Kerr + - given-names: Vijay + family-names: Thakkar + email: vithakkar@nvidia.com + affiliation: NVIDIA + - given-names: Pradeep + family-names: Ramani + email: prramani@nvidia.com + affiliation: NVIDIA + - given-names: Cris + family-names: Cecka + email: ccecka@nvidia.com + affiliation: NVIDIA + - given-names: Aniket + family-names: Shivam + email: ashivam@nvidia.com + affiliation: NVIDIA + - given-names: Honghao + family-names: Lu + email: honghaol@nvidia.com + affiliation: NVIDIA + - given-names: Ethan + family-names: Yan + email: etyan@nvidia.com + affiliation: NVIDIA + - given-names: Jack + family-names: Kosaian + email: jkosaian@nvidia.com + affiliation: NVIDIA + - given-names: Mark + family-names: Hoemmen + email: mhoemmen@nvidia.com affiliation: NVIDIA - given-names: Haicheng family-names: Wu - affiliation: NVIDIA email: haichengw@nvidia.com - - given-names: Manish - family-names: Gupta - affiliation: Google - email: manigupta@google.com - - given-names: Dustyn - family-names: Blasig - email: dblasig@nvidia.com affiliation: NVIDIA - - given-names: Pradeep - family-names: Ramini - email: prramani@nvidia.com + - given-names: Andrew + family-names: Kerr + email: akerr@nvidia.com + affiliation: NVIDIA + - given-names: Matt + family-names: Nicely + email: mnicely@nvidia.com affiliation: NVIDIA - given-names: Duane family-names: Merrill email: dumerrill@nvidia.com affiliation: NVIDIA - - given-names: Aniket - family-names: Shivam - email: ashivam@nvidia.com + - given-names: Dustyn + family-names: Blasig + email: dblasig@nvidia.com + affiliation: NVIDIA + - given-names: Fengqi + family-names: Qiao + email: fqiao@nvidia.com affiliation: NVIDIA - given-names: Piotr family-names: Majcher @@ -49,10 +77,12 @@ authors: family-names: Wang email: jinw@nvidia.com affiliation: NVIDIA - - given-names: Matt - family-names: Nicely - email: mnicely@nvidia.com - affiliation: NVIDIA + - given-names: Manish + family-names: Gupta + affiliation: Google + email: manigupta@google.com + + repository-code: 'https://github.com/NVIDIA/cutlass' abstract: >- CUTLASS is a collection of CUDA C++ template @@ -71,12 +101,12 @@ abstract: >- flexibility simplifies their use as building blocks within custom kernels and applications. keywords: - - 'cutlass, tensor cores, cuda' + - 'cutlass, tensor cores, cuda, cute, nvidia, gpu, linear algebra, matrix computations' license: BSD-3-Clause -license-url: https://github.com/NVIDIA/cutlass/blob/v2.11.0/LICENSE.txt -version: '2.11.0' -date-released: '2022-11-19' +license-url: https://github.com/NVIDIA/cutlass/blob/v3.0.0/LICENSE.txt +version: '3.0.0' +date-released: '2023-01-23' identifiers: - type: url - value: "https://github.com/NVIDIA/cutlass/tree/v2.11.0" - description: The GitHub release URL of tag 2.11.0 + value: "https://github.com/NVIDIA/cutlass/tree/v3.0.0" + description: The GitHub release URL of tag 3.0.0 diff --git a/CMakeLists.txt b/CMakeLists.txt index 2b8d7c8225..e879f780c3 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,7 +26,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -cmake_minimum_required(VERSION 3.12.4 FATAL_ERROR) +cmake_minimum_required(VERSION 3.18 FATAL_ERROR) if(cutlass_LOADED) # If CUTLASS has been previously fetched and loaded, don't do it again. 
@@ -39,35 +39,40 @@ endif() message(STATUS "CMake Version: ${CMAKE_VERSION}") set(IMPLICIT_CMAKE_CXX_STANDARD OFF CACHE BOOL "Do not explicitly specify -std=c++11 if set") -project(CUTLASS VERSION 2.11.0 LANGUAGES CXX) +project(CUTLASS VERSION 3.0.0 LANGUAGES CXX) include(${CMAKE_CURRENT_SOURCE_DIR}/CUDA.cmake) -if (CUDA_VERSION VERSION_LESS 10.2) - message(WARNING "CUTLASS ${CUTLASS_VERSION} requires CUDA 10.2 or higher, and strongly recommends CUDA 11.0 or higher.") -elseif (CUDA_VERSION VERSION_LESS 11.0) - message(WARNING "CUTLASS ${CUTLASS_VERSION} support for CUDA ${CUDA_VERSION} is deprecated, please use CUDA 11.0 or higher.") +if (CUDA_VERSION VERSION_LESS 11.3) + message(WARNING "CUTLASS ${CUTLASS_VERSION} requires CUDA 11.4 or higher, and strongly recommends CUDA 11.8 or higher.") +elseif (CUDA_VERSION VERSION_LESS 11.4) + message(WARNING "CUTLASS ${CUTLASS_VERSION} support for CUDA ${CUDA_VERSION} is deprecated, please use CUDA 11.8 or higher.") +endif() + +if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.5) + message(FATAL_ERROR "GCC version must be at least 7.5!") +endif() + +if (CUDA_COMPILER MATCHES "[Cc]lang" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0) + message(FATAL_ERROR "Clang 7.0+ required for GPU compilation") endif() find_package(Doxygen QUIET) # -# CUTLASS 2.x requires C++11 +# CUTLASS 3.x requires C++17 # -if (NOT IMPLICIT_CMAKE_CXX_STANDARD) - set(CMAKE_CXX_STANDARD 11) - set(CMAKE_CXX_STANDARD_REQUIRED ON) - set(CMAKE_CXX_EXTENSIONS OFF) -endif() +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) if(CUTLASS_NATIVE_CUDA) - set(CMAKE_CUDA_STANDARD 11) + set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_STANDARD_REQUIRED ON) + list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr) else() - if (NOT IMPLICIT_CMAKE_CXX_STANDARD) - list(APPEND CUTLASS_CUDA_NVCC_FLAGS --std=c++11) - endif() + list(APPEND CUTLASS_CUDA_NVCC_FLAGS --std=c++17) endif() - + if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) set(CMAKE_INSTALL_PREFIX install CACHE PATH "Default installation location." 
FORCE) endif() @@ -107,29 +112,14 @@ if (CUTLASS_ENABLE_TESTS) endif() set(CUTLASS_NVCC_ARCHS_SUPPORTED "") -if (NOT CUDA_VERSION VERSION_LESS 7.5) - list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 53) -endif() -if (NOT CUDA_VERSION VERSION_LESS 8.0) - list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 60 61) -endif() -if (NOT CUDA_VERSION VERSION_LESS 9.0) - list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 70) +if (CUDA_VERSION VERSION_GREATER_EQUAL 11.4 AND NOT CUDA_COMPILER MATCHES "[Cc]lang") + list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 70 72 75 80 86 87) endif() -if (NOT CUDA_VERSION VERSION_LESS 9.2) - list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 72) +if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8 AND NOT CUDA_COMPILER MATCHES "[Cc]lang") + list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 89 90) endif() -if (NOT CUDA_VERSION VERSION_LESS 10.0) - list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 75) -endif() -if (NOT CUDA_VERSION VERSION_LESS 11.0) - list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 80) -endif() -if (NOT CUDA_VERSION VERSION_LESS 11.1 AND NOT CUDA_COMPILER MATCHES "[Cc]lang") - list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 86) -endif() -if (NOT CUDA_VERSION VERSION_LESS 11.8 AND NOT CUDA_COMPILER MATCHES "[Cc]lang") - list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 90) +if (CUDA_VERSION VERSION_GREATER_EQUAL 12.0 AND NOT CUDA_COMPILER MATCHES "[Cc]lang") + list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 90a) endif() set(CUTLASS_NVCC_ARCHS ${CUTLASS_NVCC_ARCHS_SUPPORTED} CACHE STRING "The SM architectures requested.") set(CUTLASS_NVCC_ARCHS_ENABLED ${CUTLASS_NVCC_ARCHS} CACHE STRING "The SM architectures to build code for.") @@ -271,6 +261,7 @@ if (CUTLASS_ENABLE_TENSOR_CORE_MMA) list(APPEND CUTLASS_CUDA_FLAGS -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1) endif() + if (NOT MSVC AND CUTLASS_NVCC_KEEP) # MSVC flow handles caching already, but for other generators we handle it here. set(CUTLASS_NVCC_KEEP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tmp CACHE PATH "Location to store NVCC scratch files") @@ -288,6 +279,15 @@ if (CUTLASS_ENABLE_F16C AND NOT CMAKE_CROSSCOMPILING) endif() endif() +if (CUTLASS_ENABLE_OPENMP_TESTS) + find_package(OpenMP) + if(OpenMP_CXX_FOUND) + list(APPEND CUTLASS_CUDA_NVCC_FLAGS -Xcompiler=${OpenMP_CXX_FLAGS}) + else() + message(WARNING "CUTLASS_ENABLE_OPENMP_TESTS set but OpenMP not found.") + endif() +endif() + list(APPEND CUTLASS_CUDA_NVCC_FLAGS $<$:-Xcompiler=-Wconversion>) list(APPEND CUTLASS_CUDA_NVCC_FLAGS $<$:-Xcompiler=-fno-strict-aliasing>) @@ -313,10 +313,6 @@ if(CUDA_COMPILER MATCHES "[Cc]lang") message(FATAL_ERROR "Clang CUDA compilation requires Clang CXX compilation. Currently CMAKE_CXX_COMPILER is ${CMAKE_CXX_COMPILER_ID}" ) endif() - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0) - message(FATAL_ERROR "Clang 7.0+ required for GPU compilation") - endif() - # There are numerous Clang versions that can work with each CUDA toolkit and the # the checks are not very useful so we are turning them off and using testing to # ensure the various combinations work properly. @@ -341,6 +337,7 @@ if(CUDA_COMPILER MATCHES "[Cc]lang") list(APPEND CUTLASS_CUDA_CLANG_FLAGS -Wl,--disable-new-dtags) link_libraries(nvidia::cudart) + link_libraries(nvidia::cuda_driver) endif() # Support for 128-bit integers if using NVIDIA C++ compiler @@ -530,6 +527,8 @@ target_include_directories( $ $ $ + $ + $ ) install( diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 21357b5f52..5a159d8c57 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -7,63 +7,77 @@ This is the official list of CUTLASS developers and contributors. 
## DEVELOPERS -Andrew Kerr -Haicheng Wu -Manish Gupta -Dustyn Blasig -Pradeep Ramani -Cris Cecka -Vijay Thakkar -Aniket Shivam -Honghao Lu -Ethan Yan -Zhaodong Chen -Jack Kosaian -Yujia Zhai -Naila Farooqui -Piotr Majcher -Paul Springer -Jin Wang -Chinmay Talegaonkar -Shang Zhang -Scott Yokim -Markus Hohnerbach -Aditya Atluri -David Tanner -Manikandan Ananth +Vijay Thakkar
+Pradeep Ramani
+Cris Cecka
+Aniket Shivam
+Jack Kosaian
+Mark Hoemmen
+Honghao Lu
+Ethan Yan
+Haicheng Wu
+Andrew Kerr
+Dustyn Blasig
+Fengqi Qiao
+Duane Merrill
+Yujia Zhai
+Shang Zhang
+Piotr Majcher
+Paul Springer
+Markus Hohnerbach
+Jin Wang
+Aditya Atluri
+ +## CuTe +Cris Cecka
+Vijay Thakkar
## CUTLASS Product Manager -Matthew Nicely - +Matthew Nicely
+ +## Former CUTLASS Developers +Manish Gupta
+Naila Farooqui
+David Tanner
+Manikandan Ananth
+Zhaodong Chen
+Chinmay Talegaonkar
+ ## CONTRIBUTORS -Timothy Costa -Julien Demouth -Brian Fahs -Michael Goldfarb -Mostafa Hagog -Fei Hu -Alan Kaatz -Tina Li -Timmy Liu -Duane Merrill -Kevin Siu -Markus Tavenrath -John Tran -Vicki Wang -Junkai Wu -Fung Xie -Albert Xu -Jack Yang -Xiuxia Zhang -Nick Zhao +Timothy Costa
+Julien Demouth
+Brian Fahs
+Michael Garland
+Michael Goldfarb
+Mostafa Hagog
+Fei Hu
+Alan Kaatz
+Tina Li
+Timmy Liu
+Wei Liu
+Duane Merrill
+Kevin Siu
+Markus Tavenrath
+John Tran
+Vicki Wang
+Junkai Wu
+Fung Xie
+Albert Xu
+Yang Xu
+Jack Yang
+Scott Yokim
+Xiuxia Zhang
+Nick Zhao
## ACKNOWLEDGEMENTS -Girish Bharambe -Luke Durant -Olivier Giroux -Stephen Jones -Rishkul Kulkarni -Bryce Lelbach -Joel McCormack -Kyrylo Perelygin +Girish Bharambe
+Luke Durant
+Carter Edwards
+Olivier Giroux
+Stephen Jones
+Rishkul Kulkarni
+Bryce Lelbach
+Joel McCormack
+Kyrylo Perelygin
+Sean Treichler
diff --git a/README.md b/README.md index b58465132f..a89b8f49b4 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,18 @@ ![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition") -# CUTLASS 2.11 +# CUTLASS 3.0 -_CUTLASS 2.11 - November 2022_ +_CUTLASS 3.0 - January 2023_ CUTLASS is a collection of CUDA C++ template abstractions for implementing -high-performance matrix-multiplication (GEMM) and related computations at all levels +high-performance matrix-matrix multiplication (GEMM) and related computations at all levels and scales within CUDA. It incorporates strategies for hierarchical decomposition and data movement similar to those used to implement cuBLAS and cuDNN. CUTLASS decomposes these "moving parts" into reusable, modular software components abstracted by C++ template -classes. These thread-wide, warp-wide, block-wide, and device-wide primitives can be specialized -and tuned via custom tiling sizes, data types, and other algorithmic policy. The -resulting flexibility simplifies their use as building blocks within custom kernels -and applications. +classes. Primitives for different levels of a conceptual parallelization hierarchy +can be specialized and tuned via custom tiling sizes, data types, +and other algorithmic policy. The resulting flexibility simplifies their use +as building blocks within custom kernels and applications. To support a wide variety of applications, CUTLASS provides extensive support for mixed-precision computations, providing specialized data-movement and @@ -21,60 +21,75 @@ point (FP16), BFloat16 (BF16), Tensor Float 32 (TF32), single-precision floating point (FP32), [FP32 emulation via tensor core instruction](/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm), double-precision floating -point (FP64) types, integer data types (4b and 8b), and binary data types (1b). -CUTLASS demonstrates warp-synchronous matrix multiply operations -targeting the programmable, high-throughput _Tensor Cores_ implemented by -NVIDIA's Volta, Turing, and Ampere architectures. - -CUTLASS implements high-performance Convolution via the implicit GEMM algorithm. -Implicit GEMM is the formulation of a convolution operation as a GEMM thereby taking advantage of -CUTLASS's modular GEMM pipeline. -This allows CUTLASS to build convolutions by reusing highly optimized warp-wide GEMM components and below. +point (FP64) types, integer data types (4b and 8b), and binary data types (1b). +CUTLASS demonstrates warp-synchronous matrix multiply operations +targeting the programmable, high-throughput _Tensor Cores_ implemented by +NVIDIA's Volta, Turing, Ampere, and Hopper architectures. See the [Quick Start Guide](/media/docs/quickstart.md) to get started quickly. See the [functionality listing](/media/docs/functionality.md) for the list of operations supported at each level of the execution model hierarchy. -# What's New in CUTLASS 2.11 - -CUTLASS 2.11 is an update to CUTLASS adding: -- [Stream-K](/examples/47_ampere_gemm_universal_streamk), which is a new general way to do split-K. It can not only improve performance, but can also significantly reduce the number of tile sizes that need to be profiled to find the best one. -- [Fused multi-head attention kernel](/examples/41_fused_multi_head_attention). It has two variants: one for fixed sequence lengths, and another for variable sequence lengths. -- [Dual GEMM](/examples/45_dual_gemm). It can run two GEMMs that share the same left input matrix in one kernel. 
-- Hopper improves [double precision matrix multiplication](/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm90.cu) by 2x compared to Ampere at iso-clocks. It is supported since CUDA 11.8.
-- [BLAS3](/test/unit/gemm/device/hemm_cf64_cf64_cf64_tensor_op_f64_sm90.cu) functions with Hoppers new double precision matrix multiplication instructions.
-- [ELL Block Sparse GEMM](/examples/43_ell_block_sparse_gemm).
-- [Optimized Group Conv](/examples/42_ampere_tensorop_group_conv).
-- [Optimized DepthWise Conv](/examples/46_depthwise_simt_conv2dfprop).
-- [Scripts](/examples/44_multi_gemm_ir_and_codegen) to fuse multiple back-to-back GEMM.
-- [FP8 data type definition](/include/cutlass/float8.h) and [conversion routines](/include/cutlass/numeric_conversion.h#L1274-2115).
-- Updates and bugfixes from the community (thanks!). Big shout out to Meta's [xFormers](https://github.com/facebookresearch/xformers).
-- **Deprecation announcement:** CUTLASS plans to deprecate the following in the next major release:
-  - Maxwell and Pascal GPU architectures
-  - Ubuntu 16.04
-  - CUDA 10.2
-  - C++ 11
-- **Future requirement announcement:** CUTLASS plans to add the following requirements in the next major release:
-  - Minimum C++ standard - C++17
+CUTLASS 3.0 introduces a new core library, CuTe, to describe and manipulate tensors of threads and data.
+CuTe is a collection of C++ CUDA template abstractions for defining and operating on hierarchically multidimensional layouts of threads and data. CuTe provides `Layout` and `Tensor` objects that compactly package the type, shape, memory space, and layout of data, while performing the complicated indexing for the user. This lets programmers focus on the logical descriptions of their algorithms while CuTe does the mechanical bookkeeping for them. With these tools, we can quickly design, implement, and modify all dense linear algebra operations.
+
+The core abstractions of CuTe are hierarchically multidimensional layouts which can be composed with data arrays to represent tensors. The representation of layouts is powerful enough to represent nearly everything we need to implement efficient dense linear algebra. Layouts can also be combined and manipulated via functional composition, on which we build a large set of common operations such as tiling and partitioning.
+
+CUTLASS 3.0 adopts CuTe throughout the GEMM hierarchy in its templates. This greatly simplifies the design and improves code composability and readability. More documentation specific to CuTe can be found in its [dedicated documentation directory](/media/docs/cute/00_quickstart.md).
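As a minimal illustration of these vocabulary types (a sketch for this note, not code from the commit; the 4x8 shape, the strides, and `float` are arbitrary choices), a fully static layout and an owning tensor can be built from the headers this commit adds under `include/cute/`:

```c++
#include <cute/tensor.hpp>   // added under include/cute/ by this commit

int main() {
  using namespace cute;

  // A static 4x8 column-major layout: shape (4,8) paired with strides (1,4).
  auto layout = make_layout(make_shape (Int<4>{}, Int<8>{}),
                            make_stride(Int<1>{}, Int<4>{}));

  // A Layout is a function from logical coordinates to linear offsets.
  int offset = layout(2, 3);                 // 2*1 + 3*4 == 14

  // A Tensor pairs storage with a Layout and does the indexing arithmetic.
  auto tensor = make_tensor<float>(layout);  // owning, statically sized
  tensor(2, 3) = 1.0f;

  print(layout);                             // prints (_4,_8):(_1,_4)
  return (offset == 14) ? 0 : 1;
}
```

Compiled with `nvcc -std=c++17`, this exercises the same `Layout`/`Tensor` machinery that the GEMM collectives below are built on.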
+
+In addition to GEMMs, CUTLASS implements high-performance convolution via the implicit GEMM algorithm. Implicit GEMM is the formulation of a convolution operation as a GEMM, thereby taking advantage of CUTLASS's modular GEMM pipeline. This allows CUTLASS to build convolutions by reusing highly-optimized GEMM components.
+
+# What's New in CUTLASS 3.0
+
+CUTLASS 3.0, as the next major version of the CUTLASS API, brings with it CuTe, a new programming model and backend designed for massively parallel heterogeneous agents. Using CuTe, CUTLASS 3.0 provides implementations of GEMM kernels for the NVIDIA Hopper architecture.
+
+- [CuTe-based layouts and layout algebra](/media/docs/cute/00_quickstart.md)
+- [A new GEMM template API](/media/docs/gemm_api_3x.md) that eschews the architecture-centric hierarchy of 2.x in favour of a new conceptual framing. Read more in the [3.0 design documentation](/media/docs/cutlass_3x_design.md).
+- Support for 4th generation Hopper Tensor Core instructions (WGMMA) through CuTe.
+- Support for Hopper asynchronous Tensor Memory Accelerator (TMA) instructions and associated transaction barriers through CuTe.
+- New warp-specialized GEMM kernels targeting Hopper TMA + WGMMA for speed-of-light GEMMs.
+- New warp-specialized persistent GEMM kernels targeting Hopper TMA + WGMMA.
+- Support for CUDA Threadblock Clusters and programmatic TMA multicast for greater execution and data locality.
+- A new way to instantiate default GEMM kernels using `CollectiveBuilder`s that supersede the 2.x `DefaultXConfiguration` types in favour of metaprogramming-based kernel generation; a sketch follows this list. See [example 49](/examples/49_hopper_gemm_schedules_with_collective_builder/49_hopper_gemm_schedules_with_collective_builder.cu).
+- Extensions to the CUTLASS library and profiler to support CUTLASS 3.0 Hopper kernels, and a new format for kernel procedural names.
+- *Announcement*: CUTLASS plans to rename the GitHub branch `master` to `main` with a future release.
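As a sketch of the `CollectiveBuilder` flow referenced in the list above (paraphrased from example 49 rather than quoted from this patch; the element types, alignments, tile and cluster shapes, and the epilogue's exact template arguments are assumptions to verify against that example):

```c++
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/device/gemm_universal_adapter.h"

using namespace cute;

// Mainloop: the Auto types let the builder choose the stage count and
// kernel schedule appropriate for SM90.
using CollectiveMainloop = cutlass::gemm::collective::CollectiveBuilder<
    cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp,
    cutlass::half_t, cutlass::layout::RowMajor,    8,  // A: type, layout, alignment
    cutlass::half_t, cutlass::layout::ColumnMajor, 8,  // B: type, layout, alignment
    float,                                             // accumulator type
    Shape<_128,_128,_64>,                              // threadblock tile (M,N,K)
    Shape<_1,_1,_1>,                                   // cluster shape
    cutlass::gemm::collective::StageCountAuto,
    cutlass::gemm::collective::KernelScheduleAuto
>::CollectiveOp;

// Epilogue: a plain linear-combination epilogue (argument order assumed).
using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue<
    cutlass::gemm::TagToStrideC_t<cutlass::layout::ColumnMajor>,
    cutlass::gemm::TagToStrideC_t<cutlass::layout::ColumnMajor>,
    cutlass::epilogue::thread::LinearCombination<float, 1, float, float>>;

// Kernel plus the device-layer adapter shared with the 2.x API.
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
    Shape<int,int,int>,        // runtime problem shape (M,N,K)
    CollectiveMainloop,
    CollectiveEpilogue>;

using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
```

The resulting `Gemm` is constructed and launched like a 2.x device-level operator; see example 49 for runnable code and the other mainloop/kernel schedules the builder can select.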
+
+## New architecture, compiler, and CUDA Toolkit requirements
+
+Minimum requirements:
+
+- Architecture: Volta
+- Compiler: Must support at least C++17
+- CUDA Toolkit version: 11.4
+
+CUTLASS 3.0 *removes support* for the following:
+
+- Maxwell and Pascal GPU architectures
+- Ubuntu 16.04
+- CUDA 10.2
+- C++ language versions less than 17.

**See the [CHANGELOG](CHANGELOG.md) for a detailed listing of releases and updates.**

# Performance

-[figure: CUTLASS 2.x GEMM performance plot]
+[figure: CUTLASS 3.0 GEMM peak performance relative to cuBLAS (media/images/cutlass-3.0-gemm-peak-performance.png)]

CUTLASS primitives are very efficient. When used to construct device-wide GEMM kernels,
-they exhibit performance comparable to cuBLAS for scalar GEMM
+they exhibit peak performance comparable to cuBLAS for scalar GEMM
computations. The above figure shows CUTLASS performance relative to cuBLAS
-for large matrix dimensions on an [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/),
-an [NVIDIA A2](https://www.nvidia.com/en-us/data-center/products/a2/),
-an [NVIDIA TitanV](https://www.nvidia.com/en-us/titan/titan-v/),
-and an [NVIDIA GeForce 2080 Ti](https://www.nvidia.com/en-us/geforce/graphics-cards/rtx-2080-ti/)
-compiled with the [CUDA 11.5 Toolkit](https://developer.nvidia.com/cuda-downloads). Tensor Core operations are implemented using CUDA's
+for large matrix dimensions on an [NVIDIA H100](https://www.nvidia.com/en-us/data-center/h100/) (NVIDIA Hopper architecture),
+an [NVIDIA L40](https://www.nvidia.com/en-us/data-center/l40/) (NVIDIA Ada architecture),
+an [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) (NVIDIA Ampere architecture),
+and an [NVIDIA A40](https://www.nvidia.com/en-us/data-center/a40/) (NVIDIA Ampere architecture).
+CUTLASS 3.0 was compiled with the [CUDA 12.0 Toolkit](https://developer.nvidia.com/cuda-downloads).
+Tensor Core operations are implemented using CUDA's
[mma instruction](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma).
-[figure: CUTLASS 2.x implicit GEMM performance plot]
+[figure: CUTLASS implicit GEMM performance relative to cuDNN on an NVIDIA A100]

When using CUTLASS building blocks to construct device-wide implicit gemm (Fprop, Dgrad, and Wgrad)
kernels, CUTLASS performance is also comparable to cuDNN when running Resnet-50 layers on an [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/)
@@ -83,39 +98,48 @@ as shown in the above figure. Tensor Core operations are still implemented usin
# Compatibility
-CUTLASS requires a C++11 host compiler and performs best when built with the [**CUDA 11.8 Toolkit**](https://developer.nvidia.com/cuda-toolkit).
-
-It is also compatible with CUDA 11.x.
+CUTLASS requires a C++17 host compiler and
+performs best when built with the [**CUDA 12.0 Toolkit**](https://developer.nvidia.com/cuda-toolkit).
+It is also compatible with CUDA 11.4, CUDA 11.5, CUDA 11.6, CUDA 11.7, and CUDA 11.8.

## Operating Systems
We have tested the following environments.

|**Operating System** | **Compiler** |
|-----------------|----------|
-| Windows 10 | Microsoft Visual Studio 2015|
-| | Microsoft Visual Studio 2017|
-| | Microsoft Visual Studio 2019|
-| Ubuntu 18.04 | GCC 7.5.0 |
+| Ubuntu 18.04 | GCC 7.5.0 |
| Ubuntu 20.04 | GCC 10.3.0 |
| Ubuntu 22.04 | GCC 11.2.0 |

-Additionally, CUTLASS may be built with clang.
-See [these instructions](media/docs/quickstart.md#clang) for more details.
+Note: We plan to add Windows (MSVC) & Clang compiler support soon.

## Hardware
-CUTLASS runs successfully on the following NVIDIA GPUs, and it is expected to be efficient on
-any Volta-, Turing-, or NVIDIA Ampere- architecture NVIDIA GPU.
-
-|**GPU**|**CUDA Compute Capability**|**Minimum CUDA Toolkit**|**Minimum CUDA Toolkit Enabling Native Tensor Cores**|
-|---|---|---|---|
-|NVIDIA Tesla V100|7.0|9.2|10.1|
-|NVIDIA TitanV|7.0|9.2|10.1|
-|NVIDIA GeForce RTX 2080 TI, 2080, 2070|7.5|10.0|10.2|
-|NVIDIA Tesla T4|7.5|10.0|10.2|
-|NVIDIA A100|8.0|11.0|11.0|
-|NVIDIA A10 |8.6|11.1|11.1|
-|NVIDIA GeForce 3090|8.6|11.1|11.1|
-|NVIDIA H100 PCIe|9.0|11.8|Double-precision: 11.8; Mixed precision: 12.0|
+CUTLASS runs successfully on the following NVIDIA GPUs, and it is expected to be efficient on Volta, Turing, Ampere, Ada, and Hopper architecture based NVIDIA GPUs.
+
+|**GPU**|**CUDA Compute Capability**|**Minimum CUDA Toolkit Required by CUTLASS-3**|
+|---|---|---|
+|NVIDIA V100 Tensor Core GPU |7.0|11.4|
+|NVIDIA TitanV |7.0|11.4|
+|NVIDIA GeForce RTX 2080 TI, 2080, 2070 |7.5|11.4|
+|NVIDIA T4 |7.5|11.4|
+|NVIDIA A100 Tensor Core GPU |8.0|11.4|
+|NVIDIA A10 |8.6|11.4|
+|NVIDIA GeForce RTX 3090 |8.6|11.4|
+|NVIDIA GeForce RTX 4090 |8.9|11.8|
+|NVIDIA L40 |8.9|11.8|
+|NVIDIA H100 Tensor Core GPU |9.0|11.8|
+
+## Target Architecture
+
+In general, PTX code generated for one target architecture can be run on future architectures (i.e., it is forward compatible). However, CUDA 12.0 introduces the concept of "architecture-accelerated features" whose PTX does not have forward compatibility guarantees. Several Hopper PTX instructions fall under this category of architecture-accelerated features, and thus require a `sm_90a` target architecture (note the "a" appended). For more details on this and other architecture-accelerated instructions, please refer to the [CUDA Documentation](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#feature-availability).
+
+The target architecture information is passed on to CUTLASS via the cmake flag `CUTLASS_NVCC_ARCHS`. In order to maximize performance on Hopper GH100, users are required to build CUTLASS with `90a` as the target architecture.
If a user accidentally builds a kernel which uses SM90a features (e.g. Hopper Tensor Core Instructions), using the SM90 target (note the lack of "a"), with either CTK 12.0 or 11.8, the kernel is expected to fail with a runtime error. + +``` +cmake .. -DCUTLASS_NVCC_ARCHS="90a" +``` + +Please refer to the [functionality documentation](media/docs/functionality.md) for details on which kernels require which target architectures. # Documentation @@ -125,7 +149,9 @@ CUTLASS is described in the following documents and the accompanying - [Quick Start Guide](/media/docs/quickstart.md) - build and run CUTLASS - [Functionality](/media/docs/functionality.md) - summarizes functionality available in CUTLASS - [Efficient GEMM in CUDA](media/docs/efficient_gemm.md) - describes how GEMM kernels may be implemented efficiently in CUDA -- [GEMM API](media/docs/gemm_api.md) - describes the CUTLASS GEMM model and C++ template concepts +- [CUTLASS 3.x Design](media/docs/cutlass_3x_design.md) - describes the CUTLASS 3.x design, its benefits, and how CuTe enables us to write much more composable components +- [GEMM API 3.x](media/docs/gemm_api_3x.md) - describes the CUTLASS 3.x GEMM model and C++ template concepts +- [GEMM API 2.x](media/docs/gemm_api.md) - describes the CUTLASS 2.x GEMM model and C++ template concepts - [Implicit GEMM Convolution](media/docs/implicit_gemm_convolution.md) - describes 2-D and 3-D convolution in CUTLASS - [Code Organization](media/docs/code_organization.md) - describes the organization and contents of the CUTLASS project - [Terminology](media/docs/terminology.md) - describes terms used in the code @@ -161,7 +187,8 @@ $ export CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc ``` Create a build directory within the CUTLASS project, then run CMake. By default CUTLASS will build kernels -for CUDA architecture versions 5.0, 6.0, 6.1, 7.0, 7.5, 8.0, and 8.6. To reduce compile time you can specify +for CUDA architecture versions 5.0, 6.0, 6.1, 7.0, 7.5, 8.0, 8.6, 8.9, and 9.0. +To reduce compile time you can specify the architectures to build CUTLASS for by changing the CMake configuration setting `CUTLASS_NVCC_ARCHS`. @@ -224,6 +251,23 @@ include/ # client applications should target this directory transform/ # code specialized for layout, type, and domain transformations * # core vocabulary types, containers, and basic numeric operations + + cute/ # CuTe Layout, layout algebra, MMA/Copy atoms, tiled MMA/Copy + + algorithm/ # Definitions of core operations such as copy, gemm, and operations on cute::tuples + + arch/ # Bare bones PTX wrapper structs for copy and math instructions + + atom/ # Meta-information either link to or built from arch/ operators + + mma_atom.hpp # cute::Mma_Atom and cute::TiledMma + + copy_atom.hpp # cute::Copy_Atom and cute::TiledCopy + + *sm*.hpp # Arch specific meta-information for copy and math operations + + * # Core library types such as Shape, Stride, Layout, Tensor, and associated operations + ``` ### CUTLASS SDK Examples @@ -269,7 +313,7 @@ By default, only one tile size is instantiated for each data type, math instruct To instantiate all, set the following environment variable when running CMake from an empty `build/` directory. Beware, this results in *thousands* of kernels and long build times. ```bash -$ cmake .. -DCUTLASS_NVCC_ARCHS=75 -DCUTLASS_LIBRARY_KERNELS=all +$ cmake .. -DCUTLASS_NVCC_ARCHS=90a -DCUTLASS_LIBRARY_KERNELS=all ... 
$ make cutlass_profiler -j16 ``` diff --git a/cuBLAS.cmake b/cuBLAS.cmake index 6936f0a9d6..db1e36fc1c 100644 --- a/cuBLAS.cmake +++ b/cuBLAS.cmake @@ -40,7 +40,7 @@ elseif(NOT TARGET cublas) find_path( _CUBLAS_INCLUDE_DIR - NAMES cublas.h + NAMES cublas_v2.h HINTS ${CUBLAS_INCLUDE_PATH} ENV CUBLAS_INCLUDE_PATH diff --git a/examples/10_planar_complex/CMakeLists.txt b/examples/10_planar_complex/CMakeLists.txt index c24c05030f..11ca9724ec 100644 --- a/examples/10_planar_complex/CMakeLists.txt +++ b/examples/10_planar_complex/CMakeLists.txt @@ -45,5 +45,6 @@ target_link_libraries( PRIVATE cutlass_lib cutlass_tools_util_includes + cuda ) diff --git a/examples/11_planar_complex_array/CMakeLists.txt b/examples/11_planar_complex_array/CMakeLists.txt index 7434656eed..64125b5256 100644 --- a/examples/11_planar_complex_array/CMakeLists.txt +++ b/examples/11_planar_complex_array/CMakeLists.txt @@ -45,5 +45,6 @@ target_link_libraries( PRIVATE cutlass_lib cutlass_tools_util_includes + cuda ) diff --git a/examples/37_gemm_layernorm_gemm_fusion/gemm_with_layernorm.h b/examples/37_gemm_layernorm_gemm_fusion/gemm_with_layernorm.h index c2a20d751a..dde3c073a8 100644 --- a/examples/37_gemm_layernorm_gemm_fusion/gemm_with_layernorm.h +++ b/examples/37_gemm_layernorm_gemm_fusion/gemm_with_layernorm.h @@ -35,7 +35,7 @@ GemmLayernorm example = GEMM0 with partial reduction fused in epilogue (EpilogueVisitorLayerNorm) + lightweight full reduction kernel (ApplyFinalReduction) + GEMM1 with elemenwise operations fused in mainloop (GemmLayernormMainloopFusion) - + */ #pragma once @@ -77,7 +77,7 @@ template < typename ElementLayernormCompute_, typename ElementOutput, typename ThreadblockShape_, - bool IsShiftedVariance_ = false + bool IsShiftedVariance_ = false > class ApplyFinalReduction { public: @@ -91,7 +91,7 @@ class ApplyFinalReduction { using Layout = cutlass::layout::RowMajor; using TensorVariance = TensorRef; - using TensorMean = TensorRef; + using TensorMean = TensorRef; static bool const kIsShiftedVariance = IsShiftedVariance_; @@ -463,7 +463,7 @@ class EpilogueVisitorLayerNorm { for (int rid = 0; rid < kRowIterations; ++rid) { int row_step_offset = rid * kDeltaRow; int row_offset = thread_offset_row_base + step_offset + row_step_offset; - bool is_load = (row_offset < extent_.row()); + bool is_load = (row_offset < extent_.row()); shift_k_frag_[iter_idx * kRowIterations + rid] = load_shift_k_(row_offset, is_load); } @@ -504,9 +504,9 @@ class EpilogueVisitorLayerNorm { using Minus = cutlass::minus; using Exp = cutlass::fast_exp_op; - Minus minus; - Mul mul; - Exp exponential; + [[maybe_unused]] Minus minus; + [[maybe_unused]] Mul mul; + [[maybe_unused]] Exp exponential; LayernormFragment result; @@ -605,7 +605,7 @@ class EpilogueVisitorLayerNorm { CUTLASS_DEVICE ElementLayernormCompute load_shift_k_(int row_offset, bool is_load) { using ConvertShiftK = cutlass::NumericConverter; - ConvertShiftK convert_shift_k; + ConvertShiftK convert_shift_k; ElementOutput shift_k_val; // Computes the address to load shift_k element @@ -614,7 +614,7 @@ class EpilogueVisitorLayerNorm { arch::global_load(shift_k_val, (void *)curr_ptr_shift_k, is_load); // Converts data type to return ElementLayernormCompute converted_shift_k_val = convert_shift_k(shift_k_val); - + return converted_shift_k_val; } @@ -689,7 +689,7 @@ class GemmLayernorm { // // Type definitions // - + static bool const kInternalTranspose = cutlass::platform::is_same::value; static bool const kIsShiftedVariance = IsShiftedVariance_; @@ -704,14 +704,14 @@ class 
GemmLayernorm { using OperatorClass = cutlass::arch::OpClassTensorOp; using ArchTag = cutlass::arch::Sm80; - // These are mandatory layouts and data types + // These are mandatory layouts and data types // that are inheritated from pre-defined params - + using LayoutSumSqr = LayoutInputScaleBias; using LayoutSum = LayoutInputScaleBias; using ElementMean = ElementInputScaleBias; - using ElementVariance = ElementInputScaleBias; + using ElementVariance = ElementInputScaleBias; /////////////////////////////////////////////////////////////////////////////////////////////// @@ -720,7 +720,7 @@ class GemmLayernorm { using LayoutInputA1 = LayoutOutput_; using LayoutInputB1 = LayoutOutput_; using LayoutOutputC0 = LayoutOutput_; - using LayoutOutputC1 = LayoutOutput_; + using LayoutOutputC1 = LayoutOutput_; using ElementInputA0 = ElementInputA0_; using ElementInputB0 = ElementInputB0_; @@ -747,7 +747,7 @@ class GemmLayernorm { static int const kStages1 = Stages1; using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; - + /////////////////////////////////////////////////////////////////////////////////////////////// using MapArguments = cutlass::gemm::kernel::detail::MapArguments< diff --git a/examples/41_fused_multi_head_attention/fmha_grouped.h b/examples/41_fused_multi_head_attention/fmha_grouped.h index 9bae934ded..720159965a 100644 --- a/examples/41_fused_multi_head_attention/fmha_grouped.h +++ b/examples/41_fused_multi_head_attention/fmha_grouped.h @@ -180,9 +180,9 @@ struct FMHAGrouped { /// Default ctor CUTLASS_HOST_DEVICE - Arguments(): + Arguments(): problem_count(0), - threadblock_count(0), + threadblock_count(0), ptr_Q(nullptr), ptr_K(nullptr), ptr_P(nullptr), @@ -201,7 +201,7 @@ struct FMHAGrouped { /// Ctor CUTLASS_HOST_DEVICE - Arguments( + Arguments( GemmCoord *problem_sizes0, GemmCoord *problem_sizes1, int problem_count, @@ -219,7 +219,7 @@ struct FMHAGrouped { typename LayoutO::Stride::LongIndex *ldo, bool causal, GemmCoord *host_problem_sizes=nullptr - ): + ): problem_sizes0(problem_sizes0), problem_sizes1(problem_sizes1), problem_count(problem_count), @@ -311,7 +311,7 @@ struct FMHAGrouped { ldv(args.ldv), ldo(args.ldo), causal(args.causal) - { + { } @@ -464,7 +464,7 @@ struct FMHAGrouped { void operator()(Params const ¶ms, SharedStorage &shared_storage) { auto& m_prime = shared_storage.m_prime; auto& s_prime = shared_storage.s_prime; - auto& si = shared_storage.after_mm0.si; + [[maybe_unused]] auto& si = shared_storage.after_mm0.si; auto& mi = shared_storage.mi; ProblemVisitor problem_visitor( diff --git a/examples/41_fused_multi_head_attention/kernel_forward.h b/examples/41_fused_multi_head_attention/kernel_forward.h index 6321e7dde8..6cb292c0ec 100644 --- a/examples/41_fused_multi_head_attention/kernel_forward.h +++ b/examples/41_fused_multi_head_attention/kernel_forward.h @@ -481,7 +481,7 @@ struct AttentionKernel { SharedStorage& shared_storage = *((SharedStorage*)smem_buffer); auto& m_prime = shared_storage.m_prime; auto& s_prime = shared_storage.s_prime; - auto& si = shared_storage.after_mm0.si; + [[maybe_unused]] auto& si = shared_storage.after_mm0.si; auto& mi = shared_storage.mi; static_assert(kQueriesPerBlock < kNumWarpsPerBlock * kWarpSize, ""); diff --git a/examples/41_fused_multi_head_attention/mma_from_smem.h b/examples/41_fused_multi_head_attention/mma_from_smem.h index 271a9f3a2c..21ac4d104c 100644 --- a/examples/41_fused_multi_head_attention/mma_from_smem.h +++ b/examples/41_fused_multi_head_attention/mma_from_smem.h @@ -384,7 
+384,7 @@ class MmaPipelinedFromSharedMemory : public MmaBaseFromSharedMemory< // but not supported as it worsens perf: older gpus < sm80 don't // support async tranfers and have to waste registers CUTLASS_DEVICE - bool set_prologue_done(bool value) {} + void set_prologue_done(bool value) {} CUTLASS_DEVICE static void prologue( typename Base::SharedStorage& shared_storage, @@ -695,7 +695,7 @@ class MmaMultistageFromSharedMemory : public MmaBaseFromSharedMemory< } CUTLASS_DEVICE - bool set_prologue_done(bool value) { + void set_prologue_done(bool value) { prologue_done_ = value; } diff --git a/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk.cu b/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk.cu index 48d28bc22e..12739a0577 100644 --- a/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk.cu +++ b/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk.cu @@ -34,7 +34,7 @@ "classic data-parallel" and "Split-K" decompositions. For more details regarding the Stream-K method, see "Stream-K: Work-centric Parallel Decomposition - for Dense Matrix-Matrix Multiplication on the GPU" (https://arxiv.org/abs/2301.03598) + for Dense Matrix-Matrix Multiplication on the GPU" (https://arxiv.org/abs/2301.03598) Requires NVIDIA Ampere or newer device (SM80+). diff --git a/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu b/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu new file mode 100644 index 0000000000..599d1d5083 --- /dev/null +++ b/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu @@ -0,0 +1,463 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! 
\file
+ \brief Simple Hopper GEMM example using CUTLASS 3.0 APIs for NVIDIA Hopper architecture
+
+ This example demonstrates a simple way to instantiate and run a TF32 GEMM using the new CUTLASS 3.0
+ APIs on NVIDIA Hopper architecture. New features that will be showcased in this example are as follows:
+
+ 1. NVIDIA Hopper architecture introduces a new series of tensor core instructions (GMMA)
+ which are more efficient than the Ampere tensor core instructions.
+
+ 2. NVIDIA Hopper architecture includes a new Tensor Memory Accelerator (TMA) unit to transfer large
+ blocks of data efficiently between global memory and shared memory. TMA also supports asynchronous
+ copies between thread blocks in a cluster. Another advantage is that TMA can load FP32 data and
+ convert it implicitly to TF32.
+
+ 3. This example uses the Warp Specialized kernel design (see /media/docs/efficient_gemm.md for details).
+
+ Examples:
+
+ $ ./examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm --m=2048 --n=2048 --k=2048
+*/
+
+#include <iostream>
+
+#include "cutlass/cutlass.h"
+
+#include "cute/tensor.hpp"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/epilogue/collective/default_epilogue.hpp"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/reference/device/gemm.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+#include "cutlass/util/reference/device/tensor_fill.h"
+
+#include "helper.h"
+
+using namespace cute;
+
+#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// GEMM kernel configurations
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// A matrix configuration
+using ElementA = float; // Element type for A matrix operand
+using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
+constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
+
+// B matrix configuration
+using ElementB = float; // Element type for B matrix operand
+using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
+constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
+
+// C/D matrix configuration
+using ElementC = float; // Element type for C and D matrix operands
+using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C and D matrix operands
+
+// Core kernel configurations
+using ElementAccumulator = float; // Element type for internal accumulation
+using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that supports the intended feature
+using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
+using TilesShape = Shape<_128,_128,_32>; // Threadblock-level tile size
+using ClusterShape = Shape<_1,_2,_1>; // Shape of the threadblocks in a cluster
+using StageCountType = cutlass::gemm::collective::StageCountAuto; // Stage count maximized
based on the tile size
+using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; // Kernel to launch based on the default setting in the Collective Builder
+
+using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+ ArchTag, OperatorClass,
+ ElementA, LayoutA, AlignmentA,
+ ElementB, LayoutB, AlignmentB,
+ ElementAccumulator,
+ TilesShape, ClusterShape,
+ cutlass::gemm::collective::StageCountAuto,
+ cutlass::gemm::collective::KernelScheduleAuto
+ >::CollectiveOp;
+
+using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue<
+ cutlass::gemm::TagToStrideC_t<LayoutC>,
+ cutlass::gemm::TagToStrideC_t<LayoutC>,
+ cutlass::epilogue::thread::LinearCombination<ElementC, 1, ElementAccumulator, ElementAccumulator>>;
+
+using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+ Shape<int,int,int>, // Indicates ProblemShape
+ CollectiveMainloop,
+ CollectiveEpilogue
+>;
+
+using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+
+// Reference device GEMM implementation type
+using DeviceGemmReference = cutlass::reference::device::Gemm<
+ ElementA,
+ LayoutA,
+ ElementB,
+ LayoutB,
+ ElementC,
+ LayoutC,
+ ElementAccumulator,
+ ElementAccumulator>;
+
+using StrideA = typename Gemm::GemmKernel::StrideA;
+using StrideB = typename Gemm::GemmKernel::StrideB;
+using StrideC = typename Gemm::GemmKernel::StrideC;
+using StrideD = typename Gemm::GemmKernel::StrideD;
+
+//
+// Data members
+//
+
+/// Initialization
+StrideA stride_A;
+StrideB stride_B;
+StrideC stride_C;
+StrideD stride_D;
+uint64_t seed;
+
+cutlass::DeviceAllocation<ElementA> block_A;
+cutlass::DeviceAllocation<ElementB> block_B;
+cutlass::DeviceAllocation<ElementC> block_C;
+cutlass::DeviceAllocation<ElementC> block_D;
+cutlass::DeviceAllocation<ElementC> block_ref_D;
+
+#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Testbed utility types
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Command line options parsing
+struct Options {
+
+ bool help;
+
+ float alpha, beta;
+ int iterations;
+ int m, n, k;
+
+ Options():
+ help(false),
+ m(5120), n(4096), k(4096),
+ alpha(1.f), beta(0.f),
+ iterations(1000)
+ { }
+
+ // Parses the command line
+ void parse(int argc, char const **args) {
+ cutlass::CommandLine cmd(argc, args);
+
+ if (cmd.check_cmd_line_flag("help")) {
+ help = true;
+ return;
+ }
+
+ cmd.get_cmd_line_argument("m", m);
+ cmd.get_cmd_line_argument("n", n);
+ cmd.get_cmd_line_argument("k", k);
+ cmd.get_cmd_line_argument("alpha", alpha, 1.f);
+ cmd.get_cmd_line_argument("beta", beta, 0.f);
+ cmd.get_cmd_line_argument("iterations", iterations);
+ }
+
+ /// Prints the usage statement.
+ std::ostream & print_usage(std::ostream &out) const {
+
+ out << "48_hopper_warp_specialized_gemm\n\n"
+ << " Hopper FP32 GEMM using a Warp Specialized kernel.\n\n"
+ << "Options:\n\n"
+ << " --help If specified, displays this usage statement\n\n"
+ << " --m=<int> Sets the M extent of the GEMM\n"
+ << " --n=<int> Sets the N extent of the GEMM\n"
+ << " --k=<int> Sets the K extent of the GEMM\n"
+ << " --alpha=<f32> Epilogue scalar alpha\n"
+ << " --beta=<f32> Epilogue scalar beta\n\n"
+ << " --iterations=<int> Number of profiling iterations to perform.\n\n";
+
+ out
+ << "\n\nExamples:\n\n"
+ << "$ " << "48_hopper_warp_specialized_gemm" << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n";
+
+ return out;
+ }
+
+ /// Compute performance in GFLOP/s
+ double gflops(double runtime_s) const
+ {
+ // Two flops per multiply-add
+ uint64_t flop = uint64_t(2) * m * n * k;
+ double gflop = double(flop) / double(1.0e9);
+ return gflop / runtime_s;
+ }
+};
+
+/// Result structure
+struct Result
+{
+ double avg_runtime_ms;
+ double gflops;
+ cutlass::Status status;
+ cudaError_t error;
+ bool passed;
+
+ Result(
+ double avg_runtime_ms = 0,
+ double gflops = 0,
+ cutlass::Status status = cutlass::Status::kSuccess,
+ cudaError_t error = cudaSuccess)
+ :
+ avg_runtime_ms(avg_runtime_ms), gflops(gflops), status(status), error(error), passed(false)
+ {}
+
+};
+
+#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// GEMM setup and evaluation
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Helper to initialize a block of device data
+template <typename Element>
+bool initialize_block(
+ cutlass::DeviceAllocation<Element>& block,
+ uint64_t seed=2023) {
+
+ Element scope_max, scope_min;
+ int bits_input = cutlass::sizeof_bits<Element>::value;
+
+ if (bits_input == 1) {
+ scope_max = 2;
+ scope_min = 0;
+ } else if (bits_input <= 8) {
+ scope_max = 2;
+ scope_min = -2;
+ } else {
+ scope_max = 8;
+ scope_min = -8;
+ }
+
+ cutlass::reference::device::BlockFillRandomUniform(
+ block.get(), block.size(), seed, scope_max, scope_min, 0);
+
+ return true;
+}
+
+/// Initialize operands to be used in the GEMM and reference GEMM
+void initialize(const Options &options) {
+
+ stride_A = make_cute_packed_stride(StrideA{}, cute::make_shape(options.m, options.k, Int<1>{}));
+ stride_B = make_cute_packed_stride(StrideB{}, cute::make_shape(options.n, options.k, Int<1>{}));
+ stride_C = make_cute_packed_stride(StrideC{}, cute::make_shape(options.m, options.n, Int<1>{}));
+ stride_D = make_cute_packed_stride(StrideD{}, cute::make_shape(options.m, options.n, Int<1>{}));
+
+ block_A.reset(options.m * options.k);
+ block_B.reset(options.k * options.n);
+ block_C.reset(options.m * options.n);
+ block_D.reset(options.m * options.n);
+ block_ref_D.reset(options.m * options.n);
+
+ initialize_block(block_A, seed + 2023);
+ initialize_block(block_B, seed + 2022);
+ initialize_block(block_C, seed + 2021);
+}
+
+/// Populates a Gemm::Arguments structure from the given commandline options
+typename Gemm::Arguments args_from_options(const Options &options)
+{
+ typename Gemm::Arguments arguments{
+ cutlass::gemm::GemmUniversalMode::kGemm,
+ {options.m, options.n, options.k},
+ block_A.get(),
+ stride_A,
+ block_B.get(),
+ stride_B,
+ {block_C.get(), stride_C, block_D.get(), stride_D, {options.alpha, options.beta}}
+ };
+
+ return arguments;
+}
+
+bool verify(const Options &options) {
+ cutlass::TensorRef ref_A(block_A.get(),
Gemm::LayoutA::packed({options.m, options.k}));
+ cutlass::TensorRef ref_B(block_B.get(), Gemm::LayoutB::packed({options.n, options.k}));
+ cutlass::TensorRef ref_C(block_C.get(), Gemm::LayoutC::packed({options.m, options.n}));
+ cutlass::TensorRef ref_D(block_ref_D.get(), Gemm::LayoutD::packed({options.m, options.n}));
+
+ //
+ // Compute reference output
+ //
+
+ // Create instantiation for device reference gemm kernel
+ DeviceGemmReference gemm_reference;
+
+ // Launch device reference gemm kernel
+ gemm_reference(
+ {options.m, options.n, options.k},
+ ElementAccumulator(options.alpha),
+ ref_A,
+ ref_B,
+ ElementAccumulator(options.beta),
+ ref_C,
+ ref_D);
+
+ // Wait for kernel to finish
+ CUDA_CHECK(cudaDeviceSynchronize());
+
+ // Check if output from CUTLASS kernel and reference kernel are equal or not
+ bool passed = cutlass::reference::device::BlockCompareEqual(block_ref_D.get(), block_D.get(), block_D.size());
+
+ return passed;
+}
+
+/// Execute a given example GEMM computation
+template <typename Gemm>
+int run(Options &options)
+{
+ initialize(options);
+
+ // Instantiate CUTLASS kernel depending on templates
+ Gemm gemm;
+
+ // Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm
+ auto arguments = args_from_options(options);
+
+ // Using the arguments, query for extra workspace required for matrix multiplication computation
+ size_t workspace_size = Gemm::get_workspace_size(arguments);
+
+ // Allocate workspace memory
+ cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+ // Check if the problem size is supported or not
+ CUTLASS_CHECK(gemm.can_implement(arguments));
+
+ // Initialize CUTLASS kernel with arguments and workspace pointer
+ CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
+
+ // Correctness / Warmup iteration
+ CUTLASS_CHECK(gemm.run());
+
+ // Check if output from CUTLASS kernel and reference kernel are equal or not
+ Result result;
+ result.passed = verify(options);
+
+ std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;
+
+ if (!result.passed) {
+ exit(-1);
+ }
+
+ // Run profiling loop
+ if (options.iterations > 0)
+ {
+ GpuTimer timer;
+ timer.start();
+ for (int iter = 0; iter < options.iterations; ++iter) {
+ CUTLASS_CHECK(gemm.run());
+ }
+ timer.stop();
+
+ // Compute average runtime and GFLOPs.
+ float elapsed_ms = timer.elapsed_millis();
+ result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
+ result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
+
+ std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << std::endl;
+ std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl;
+ std::cout << " GFLOPS: " << result.gflops << std::endl;
+ }
+
+ return 0;
+}
+
+#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char const **args) {
+
+ // CUTLASS must be compiled with CUDA 12.0 Toolkit to run this example
+ // and must have compute capability at least 90.
+ if (__CUDACC_VER_MAJOR__ < 12) {
+ std::cerr << "This example requires CUDA 12 or newer.\n";
+ // Returning zero so this test passes on older Toolkits. Its actions are a no-op.
+ return 0;
+ }
+
+ cudaDeviceProp props;
+ int current_device_id;
+ CUDA_CHECK(cudaGetDevice(&current_device_id));
+ CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
+ if (props.major < 9) {
+ std::cerr
+ << "This example requires a GPU of NVIDIA's Hopper Architecture or "
+ << "later (compute capability 90 or greater).\n";
+ return 0;
+ }
+
+ //
+ // Parse options
+ //
+
+ Options options;
+
+ options.parse(argc, args);
+
+ if (options.help) {
+ options.print_usage(std::cout) << std::endl;
+ return 0;
+ }
+
+ //
+ // Evaluate CUTLASS kernels
+ //
+
+#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
+ run<Gemm>(options);
+#endif
+
+ return 0;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/48_hopper_warp_specialized_gemm/CMakeLists.txt b/examples/48_hopper_warp_specialized_gemm/CMakeLists.txt
new file mode 100644
index 0000000000..b00c7244d2
--- /dev/null
+++ b/examples/48_hopper_warp_specialized_gemm/CMakeLists.txt
@@ -0,0 +1,35 @@
+
+# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+
+cutlass_example_add_executable(
+ 48_hopper_warp_specialized_gemm
+ 48_hopper_warp_specialized_gemm.cu
+ )
diff --git a/examples/49_hopper_gemm_schedules_with_collective_builder/49_hopper_gemm_schedules_with_collective_builder.cu b/examples/49_hopper_gemm_schedules_with_collective_builder/49_hopper_gemm_schedules_with_collective_builder.cu
new file mode 100644
index 0000000000..1d92bef93c
--- /dev/null
+++ b/examples/49_hopper_gemm_schedules_with_collective_builder/49_hopper_gemm_schedules_with_collective_builder.cu
@@ -0,0 +1,522 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+ \brief Hopper GEMM example leveraging collective operation builders.
+
+ This example showcases the use of CUTLASS's CollectiveBuilder to easily construct performant kernels
+ targeting the NVIDIA Hopper architecture.
+
+ Background and motivation
+ -------------------------
+ CUTLASS kernels are highly parameterizable via template parameters. To ease the selection of template
+ parameters, CUTLASS 2 leveraged DefaultGemmConfigurations. Given a small set of parameters, such as
+ the data types of operands and the compute capability of the GPU, DefaultGemmConfigurations defined sensible
+ defaults for the many other parameters to the kernel (e.g., warp shape, stage count).
+
+ However, DefaultGemmConfigurations leave multiple opportunities for improvement, which are addressed
+ in CUTLASS 3:
+ (1) DefaultGemmConfigurations do not allow one to use a more-performant set of parameters without
+ specifying every parameter. For example, the DefaultGemmConfigurations for GEMMs targeting
+ Ampere specify that three pipeline stages should be used regardless of the sizes of operands.
+ If one wished to increase this value, one would also need to specify all other template parameters.
+ This leaves a gap between a high-level ease-of-use interface and a lower-level detailed interface.
+ (2) A new DefaultGemmConfiguration was required for each combination of operand types, GPU architecture,
+ and operation type (e.g., Tensor Core or SIMT). This led to increased code size to cover each unique
+ configuration and a lack of extensibility from one DefaultGemmConfiguration to another.
+
+ Alongside these opportunities for improvement, the Hopper architecture offers new features that increase
+ the number of valid configurations of a kernel.
In addition to the many template parameters already available
+ in CUTLASS 2 kernels, CUTLASS 3 kernels targeting Hopper also have various scheduling modes to select from that control:
+ (1) how data is to be loaded (e.g., using the Hopper TMA feature or Ampere cp.async)
+ (2) how work is to be divided among warps in a thread block (e.g., whether to use "warp specialization")
+ (3) whether persistent thread blocks should be used
+ This increased configuration space further motivates rethinking DefaultGemmConfigurations.
+
+ Introduction to the CollectiveBuilder
+ -------------------------------------
+ CUTLASS 3 introduces the CollectiveBuilder to further ease the process of selecting template parameters
+ for kernels targeting Hopper. Similar to the DefaultGemmConfigurations used in CUTLASS 2, the CollectiveBuilder
+ takes in a small set of template parameters (e.g., the data types of operands A and B). It then automatically
+ determines the data loading strategy to use depending on whether the Hopper TMA feature can be used with the provided
+ parameters. If one does not indicate a particular scheduling policy or stage count to use (by using `Auto` template
+ parameters), the CollectiveBuilder will also automatically select these.
+
+ Unlike with DefaultGemmConfigurations, a partial specialization of the CollectiveBuilder is not needed for many
+ configurations of operand types. Instead, the CollectiveBuilder "builds" a configuration based on generic
+ properties of the specified operands, layouts, and other parameters. For example, when the stage count
+ is set to `Auto`, the CollectiveBuilder may automatically calculate the maximum number of stages that
+ will fit in shared memory given the types of operands and the thread block shape, rather than simply using
+ a single default value.
+
+ Note that one does not need to use the CollectiveBuilder to declare CUTLASS 3 kernels; one can still provide
+ every template parameter to the gemm::collective::CollectiveMma. Specifying every template parameter in this
+ manner remains the primary API for using CUTLASS 3 kernels. The CollectiveBuilder is simply meant to be
+ a convenience interface.
+
+ Note also that, while the selections made by the CollectiveBuilder attempt to maximize performance, this is not
+ a guarantee. Furthermore, the behavior of the CollectiveBuilder when `Auto` parameters are provided is subject
+ to change in future CUTLASS releases -- do not rely on `Auto` if you require a specific scheduling policy and/or
+ stage count to be used.
+
+ Details of this example
+ -----------------------
+ This example walks through the use of the CollectiveBuilder with various schedules and stage counts specified.
+ This example also illustrates how CUTLASS 3 GEMMs targeting Hopper automatically support batched GEMMs by simply
+ extending the problem size with an additional tensor rank.
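+
+ As a concrete sketch, using names that appear later in this file: batching is requested by
+ constructing a rank-4 problem shape whose trailing extent is the batch count L, e.g.
+
+ ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l};
+
+ rather than a rank-3 (M, N, K) shape.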
+ + Example usage: + $ ./examples/49_hopper_gemm_schedules_with_collective_builder/49_hopper_gemm_schedules_with_collective_builder \ + --m=2048 --n=2048 --k=2048 --l=2 +*/ + +#include + +#include "cute/tensor.hpp" + +#include "cutlass/cutlass.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/reference/device/tensor_fill.h" + +using namespace cute; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Command line options parsing +struct Options { + + bool help; + bool error; + + int m, n, k, l; + float alpha, beta; + + Options(): + help(false), + error(false), + m(2048), n(2048), k(2048), l(1), + alpha(1.f), beta(0.f) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m, 2048); + cmd.get_cmd_line_argument("n", n, 2048); + cmd.get_cmd_line_argument("k", k, 2048); + cmd.get_cmd_line_argument("l", l, 1); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "49_hopper_gemm_schedules_with_collective_builder\n\n" + << " This example showcases the use of CUTLASS's collective operation builders to easily construct\n" + << " performant kernels targetting NVIDIA's Hopper architecture.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= Sets the L extent (batch count) of the GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n"; + + return out; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to initialize a block of device data +template +bool initialize_block( + cutlass::DeviceAllocation& block, + uint64_t seed=2023) { + + Element scope_max, scope_min; + int bits_input = cutlass::sizeof_bits::value; + + if (bits_input == 1) { + scope_max = 2; + scope_min = 0; + } else if (bits_input <= 8) { + scope_max = 2; + scope_min = -2; + } else { + scope_max = 8; + scope_min = -8; + } + + cutlass::reference::device::BlockFillRandomUniform( + block.get(), block.size(), seed, scope_max, scope_min, 0); + + return true; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +// Wrapper to construct, run, and verify a GEMM. This example showcases CUTLASS's collective +// operation builders by specializing the GEMM only on the kernel schedule it will use and the +// number of pipeline stages. 
+// +// For either option, one can use a special `Auto` type that tells the CollectiveBuilder +// to select an appropriate value on its own. The CollectiveBuilder will attempt to select +// values that will result in the most-performant kernel, but this is not a guarantee. Furthermore, +// the behavior of the CollectiveBuilder with `Auto` types is subject to change in future releases +// -- do not rely on `Auto` if you require a specific scheduling policy. +template < + // Type of kernel schedule to generate + class KernelScheduleType = cutlass::gemm::collective::KernelScheduleAuto, + // Number of pipeline stages to use + class StageCountType = cutlass::gemm::collective::StageCountAuto +> +struct ExampleRunner { + + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; + using LayoutC = cutlass::layout::ColumnMajor; + using LayoutD = cutlass::layout::ColumnMajor; + + static constexpr int kAlignmentA = 8; + static constexpr int kAlignmentB = 8; + using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + cutlass::half_t, LayoutA, kAlignmentA, + cutlass::half_t, LayoutB, kAlignmentB, + float, + Shape<_128,_128,_64>, Shape<_2,_1,_1>, + StageCountType, + KernelScheduleType + >::CollectiveOp; + + using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue< + cutlass::gemm::TagToStrideC_t, + cutlass::gemm::TagToStrideC_t, + cutlass::epilogue::thread::LinearCombination>; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, + CollectiveMainloop, + CollectiveEpilogue + >; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + + using LayoutTagA = decltype(cutlass::gemm::detail::stride_to_layout_tag_A()); + using LayoutTagB = decltype(cutlass::gemm::detail::stride_to_layout_tag_B()); + using LayoutTagC = decltype(cutlass::gemm::detail::stride_to_layout_tag_A()); + using LayoutTagD = decltype(cutlass::gemm::detail::stride_to_layout_tag_A()); + + // + // Data members + // + + /// Initialization + StrideA stride_A; + StrideB stride_B; + StrideC stride_C; + StrideD stride_D; + uint64_t seed = 0; + + cutlass::DeviceAllocation block_A; + cutlass::DeviceAllocation block_B; + cutlass::DeviceAllocation block_C; + cutlass::DeviceAllocation block_D; + cutlass::DeviceAllocation block_ref_D; + + // + // Methods + // + + bool verify(const ProblemShapeType& problem_size, float alpha, float beta) { + auto [M, N, K, L] = problem_size; + + cutlass::TensorRef ref_A(block_A.get(), Gemm::LayoutA::packed({M, K})); + cutlass::TensorRef ref_B(block_B.get(), Gemm::LayoutB::packed({K, N})); + cutlass::TensorRef ref_C(block_C.get(), Gemm::LayoutC::packed({M, N})); + cutlass::TensorRef ref_D(block_ref_D.get(), Gemm::LayoutD::packed({M, N})); + + cutlass::reference::device::GemmComplex( + {M, N, K}, + typename Gemm::EpilogueOutputOp::ElementCompute(alpha), + ref_A, + cutlass::ComplexTransform::kNone, + ref_B, + cutlass::ComplexTransform::kNone, + typename Gemm::EpilogueOutputOp::ElementCompute(beta), + ref_C, + ref_D, + typename Gemm::EpilogueOutputOp::ElementAccumulator(0.f), + L, // batch_count + M * K, // batch_stride_A + K * N, // batch_stride_B + M * N, // batch_stride_C + M * N // 
batch_stride_D + ); + + cudaError_t result = cudaDeviceSynchronize(); + if (result != cudaSuccess) { + std::cerr << "Reference kernel failed. Last CUDA error: " + << cudaGetErrorString(result) << std::endl; + return false; + } + + // Check if output from CUTLASS kernel and reference kernel are equal or not + bool passed = cutlass::reference::device::BlockCompareEqual(block_ref_D.get(), block_D.get(), block_D.size()); + + return passed; + } + + /// Initialize operands to be used in the GEMM and reference GEMM + void initialize(const ProblemShapeType& problem_size) { + auto problem_shape_MNKL = cute::append<4>(problem_size, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + stride_A = make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + stride_B = make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); + stride_C = make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); + stride_D = make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); + + block_A.reset(M * K * L); + block_B.reset(K * N * L); + block_C.reset(M * N * L); + block_D.reset(M * N * L); + block_ref_D.reset(M * N * L); + + initialize_block(block_A, seed + 2023); + initialize_block(block_B, seed + 2022); + initialize_block(block_C, seed + 2021); + } + + bool run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { + ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; + + initialize(problem_size); + + typename Gemm::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + block_A.get(), + stride_A, + block_B.get(), + stride_B, + {block_C.get(), stride_C, block_D.get(), stride_D, {options.alpha, options.beta}}, + hw_info + }; + + Gemm gemm_op; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + cutlass::Status status = gemm_op.can_implement(arguments); + if (status != cutlass::Status::kSuccess) { + std::cerr << "This kernel is not supported. Last CUDA error is: " + << cudaGetErrorString(cudaGetLastError()) << std::endl; + return false; + } + + status = gemm_op.initialize(arguments, workspace.get()); + if (status != cutlass::Status::kSuccess) { + std::cerr << "Failed to initialize the CUTLASS kernel. Last CUDA error is: " + << cudaGetErrorString(cudaGetLastError()) << std::endl; + return false; + } + + // Run the GEMM + status = gemm_op.run(); + if (status != cutlass::Status::kSuccess) { + std::cerr << "Failed to launch the CUTLASS kernel. Last CUDA error is: " + << cudaGetErrorString(cudaGetLastError()) << std::endl; + return false; + } + + cudaError_t result = cudaDeviceSynchronize(); + if (result != cudaSuccess) { + std::cerr << "Error running the CUTLASS kernel. Last CUDA error is: " + << cudaGetErrorString(result) << std::endl; + return false; + } + + // Verify that the result is correct + bool passed = verify(problem_size, options.alpha, options.beta); + if (!passed) { + std::cerr << "Reference check failed" << std::endl; + } + + return passed; + } + +}; + +#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to print a description of the example run and its result +void print_result(const std::string& description, bool passed) { + std::cout << description << ": " << (passed ? 
"Passed" : "Failed") << std::endl; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (__CUDACC_VER_MAJOR__ < 12 || props.major < 9) { + std::cout + << "This example requires a GPU of NVIDIA's Hopper Architecture or " + << "later (compute capability 90 or greater) and CUDA 12.0 or greater.\n"; + return 0; + } + + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return -1; + } + +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) + + // + // Run examples + // + + // The KernelHardwareInfo struct holds the number of SMs on the GPU with a given device ID. This + // information is used by the underlying kernel. + cutlass::KernelHardwareInfo hw_info; + + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. + hw_info.device_id = 0; + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + + bool passed; + + // This first example constructs a GEMM using the default schedule and stage count provided by + // the CollectiveBuilder. The scheduling policy that is expected to be most performant will be + // selected and the maximum number of stages that can fit in shared memory will be selected. + // + // This example is equivalent to declaring + // ExampleRunner + // Each of the `Auto` types indicate that the CollectiveBuilder should determine the scheduling policy and + // stage count. Note that the behavior of the CollectiveBuilder with `Auto` parameters is subject to change + // -- do not rely on `Auto` if you require a specific scheduling policy. + ExampleRunner<> auto_schedule_auto_stage_runner; + passed = auto_schedule_auto_stage_runner.run(options, hw_info); + print_result("Automatically-selected schedule and stage count", passed); + + // One can override the stage count used in the GEMM by replacing cutlass::gemm::collective::StageCountAuto + // with the number of stages to use (5 in this case). + ExampleRunner auto_schedule_5_stage_runner; + passed = auto_schedule_5_stage_runner.run(options, hw_info); + print_result("Automatically-selected schedule with 5 stages", passed); + + // One can also override the scheduling policy to use. In this case, use the KernelTma scheduling + // policy, which specifies that the Hopper TMA feature should be used. + ExampleRunner tma_schedule_auto_stage_runner; + passed = tma_schedule_auto_stage_runner.run(options, hw_info); + print_result("TMA schedule with automatically-selected stage count", passed); + + // Here, we override the scheduling policy to use Hopper's TMA feature alongside the warp-specialized + // scheduling policy. + // + // Note that, as of the CUTLASS 3.0 release, this is the default scheduling policy + // used by the CollectiveBuilder, so this declaration is equivalent to ExampleRunner<> and + // ExampleRunner. However, this default is subject to + // change in future releases -- do not rely on `Auto` if you require a specific scheduling policy. 
+ ExampleRunner<cutlass::gemm::KernelTmaWarpSpecialized> ws_schedule_auto_stage_runner;
+ passed = ws_schedule_auto_stage_runner.run(options, hw_info);
+ print_result("Warp-specialized TMA schedule with automatically-selected stage count", passed);
+
+ // Finally, we override the scheduling policy to use Hopper's TMA feature, alongside the warp-specialized
+ // scheduling policy, leveraging persistent thread blocks.
+ ExampleRunner<cutlass::gemm::KernelTmaWarpSpecializedPersistent> ws_persistent_schedule_auto_stage_runner;
+ passed = ws_persistent_schedule_auto_stage_runner.run(options, hw_info);
+ print_result("Persistent warp-specialized TMA schedule with automatically-selected stage count", passed);
+
+#endif
+
+ return 0;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/49_hopper_gemm_schedules_with_collective_builder/CMakeLists.txt b/examples/49_hopper_gemm_schedules_with_collective_builder/CMakeLists.txt
new file mode 100644
index 0000000000..30c6e5ead0
--- /dev/null
+++ b/examples/49_hopper_gemm_schedules_with_collective_builder/CMakeLists.txt
@@ -0,0 +1,35 @@
+
+# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+
+cutlass_example_add_executable(
+ 49_hopper_gemm_schedules_with_collective_builder
+ 49_hopper_gemm_schedules_with_collective_builder.cu
+ )
diff --git a/examples/50_hopper_gemm_with_epilogue_swizzle/50_hopper_gemm_with_epilogue_swizzle.cu b/examples/50_hopper_gemm_with_epilogue_swizzle/50_hopper_gemm_with_epilogue_swizzle.cu
new file mode 100644
index 0000000000..7323cc39de
--- /dev/null
+++ b/examples/50_hopper_gemm_with_epilogue_swizzle/50_hopper_gemm_with_epilogue_swizzle.cu
@@ -0,0 +1,529 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+ \brief Hopper GEMM example to create a GEMM kernel with custom Collectives
+
+ The following example shows how to assemble a custom GEMM kernel that spells out the Collectives
+ directly instead of using a builder and, in the process, instantiates a more efficient Epilogue
+ (from `cutlass/epilogue/collective/epilogue.hpp`) instead of using the default epilogue.
+
+ The GemmUniversal API takes 3 main template arguments:
+ (1) the problem shape / extents
+ (2) the collective mainloop type
+ (3) the collective epilogue type
+
+ While the collective mainloop can be stamped out using a CollectiveBuilder interface, it is
+ possible to build a custom collective mainloop directly as well. Furthermore, since epilogues
+ do not yet have a builder interface, this example shows how to instantiate a more-efficient
+ epilogue alongside the collective mainloop.
+
+ Note: there are several ways to implement the GEMM epilogue in Hopper - each with its own set
+ of trade-offs. So it is recommended that users look at the options available under
+ cutlass/epilogue/collective and evaluate them for their particular scenario.
+
+ Please refer to examples 48 and 49 to learn more about kernel schedules, and to the CuTe examples
+ in `test/unit/cute` to familiarize yourself with the basics of CuTe.
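+
+ In outline, the kernel assembled here has the shape sketched below. This is illustrative only:
+ the alias names (CollectiveMainloop, CollectiveEpilogue) stand in for the concrete collective
+ types that are spelled out further down in this file.
+
+ using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+ Shape<int,int,int,int>, // (1) the problem shape / extents (M, N, K, batch L)
+ CollectiveMainloop, // (2) the collective mainloop type
+ CollectiveEpilogue>; // (3) the collective epilogue type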
+
+    Examples:
+
+      $ ./examples/50_hopper_gemm_with_epilogue_swizzle/50_hopper_gemm_with_epilogue_swizzle
+*/
+
+#include <iostream>
+
+#include "cutlass/cutlass.h"
+
+#include "cute/tensor.hpp"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/epilogue/collective/epilogue.hpp"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/reference/device/gemm_complex.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+#include "cutlass/util/reference/device/tensor_fill.h"
+
+using namespace cute;
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Command line options parsing
+struct Options {
+
+  bool help;
+  bool error;
+
+  int m, n, k, l;
+  int alpha, beta;
+
+  Options():
+    help(false),
+    error(false),
+    m(2048), n(2048), k(2048), l(1),
+    alpha(1), beta(0)
+  { }
+
+  // Parses the command line
+  void parse(int argc, char const **args) {
+    cutlass::CommandLine cmd(argc, args);
+
+    if (cmd.check_cmd_line_flag("help")) {
+      help = true;
+      return;
+    }
+
+    cmd.get_cmd_line_argument("m", m, 2048);
+    cmd.get_cmd_line_argument("n", n, 2048);
+    cmd.get_cmd_line_argument("k", k, 2048);
+    cmd.get_cmd_line_argument("l", l, 1);
+    cmd.get_cmd_line_argument("alpha", alpha, 1);
+    cmd.get_cmd_line_argument("beta", beta, 0);
+  }
+
+  /// Prints the usage statement.
+  std::ostream & print_usage(std::ostream &out) const {
+
+    out << "50_hopper_gemm_with_epilogue_swizzle\n\n"
+        << "Hopper GEMM Example with Epilogue Swizzle.\n\n"
+        << "Options:\n\n"
+        << "  --help                      If specified, displays this usage statement\n\n"
+        << "  --m=<int>                   Sets the M extent of the GEMM\n"
+        << "  --n=<int>                   Sets the N extent of the GEMM\n"
+        << "  --k=<int>                   Sets the K extent of the GEMM\n"
+        << "  --l=<int>                   Sets the L extent (batch count) of the GEMM\n"
+        << "  --alpha=<int>               Epilogue scalar alpha\n"
+        << "  --beta=<int>                Epilogue scalar beta\n\n";
+
+    return out;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Helper to initialize a block of device data
+template <typename Element>
+bool initialize_block(
+  cutlass::DeviceAllocation<Element>& block,
+  uint64_t seed=2023) {
+
+  Element scope_max, scope_min;
+  int bits_input = cutlass::sizeof_bits<Element>::value;
+
+  if (bits_input == 1) {
+    scope_max = 2;
+    scope_min = 0;
+  } else if (bits_input <= 8) {
+    scope_max = 2;
+    scope_min = -2;
+  } else {
+    scope_max = 8;
+    scope_min = -8;
+  }
+
+  cutlass::reference::device::BlockFillRandomUniform(
+    block.get(), block.size(), seed, scope_max, scope_min, 0);
+
+  return true;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
+
+// Wrapper to run and verify a GEMM.
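+// As a summary of run() below, the runner follows the usual CUTLASS 3.x device-API flow:
+//   initialize(problem) -> build Gemm::GemmKernel::Arguments -> get_workspace_size()
+//   -> can_implement() -> initialize(args, workspace) -> run()
+//   -> cudaDeviceSynchronize() -> verify() against a device-side reference GEMM.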
+template <
+  class Gemm
+>
+struct ExampleRunner {
+
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+  using StrideD = typename Gemm::GemmKernel::StrideD;
+
+  using LayoutA = typename Gemm::LayoutA;
+  using LayoutB = typename Gemm::LayoutB;
+  using LayoutC = typename Gemm::LayoutC;
+  using LayoutD = typename Gemm::LayoutD;
+
+  using ElementA = typename Gemm::ElementA;
+  using ElementB = typename Gemm::ElementB;
+  using ElementAcc = typename Gemm::ElementAccumulator;
+
+  using CollectiveEpilogue = typename Gemm::CollectiveEpilogue;
+  using ElementC = typename Gemm::ElementC;
+  using ElementOutput = typename CollectiveEpilogue::ElementOutput;
+  using ElementCompute = typename CollectiveEpilogue::ElementCompute;
+  using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator;
+
+  using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape;
+
+  //
+  // Data members
+  //
+
+  /// Initialization
+  StrideA stride_A;
+  StrideB stride_B;
+  StrideC stride_C;
+  StrideD stride_D;
+  uint64_t seed = 0;
+
+  cutlass::DeviceAllocation<ElementA> block_A;
+  cutlass::DeviceAllocation<ElementB> block_B;
+  cutlass::DeviceAllocation<ElementC> block_C;
+  cutlass::DeviceAllocation<ElementOutput> block_D;
+  cutlass::DeviceAllocation<ElementOutput> block_ref_D;
+
+  //
+  // Methods
+  //
+
+  bool verify(const ProblemShapeType& problem_size, int32_t alpha, int32_t beta) {
+    auto [M, N, K, L] = problem_size;
+
+    cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K}));
+    cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N}));
+    cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N}));
+    cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N}));
+
+    cutlass::reference::device::GemmComplex(
+          {M, N, K},
+          ElementCompute(alpha),
+          ref_A,
+          cutlass::ComplexTransform::kNone,
+          ref_B,
+          cutlass::ComplexTransform::kNone,
+          ElementCompute(beta),
+          ref_C,
+          ref_D,
+          ElementAccumulator(0),
+          L,     // batch_count
+          M * K, // batch_stride_A
+          K * N, // batch_stride_B
+          M * N, // batch_stride_C
+          M * N  // batch_stride_D
+        );
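+
+    // Note: with both transforms set to kNone, GemmComplex reduces to a plain real-valued
+    // batched GEMM, D = alpha * A * B + beta * C, computed per batch on the device.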
+
+    cudaError_t result = cudaDeviceSynchronize();
+    if (result != cudaSuccess) {
+      std::cerr << "Reference kernel failed. Last CUDA error: "
+                << cudaGetErrorString(result) << std::endl;
+      return false;
+    }
+
+    // Check if output from CUTLASS kernel and reference kernel are equal or not
+    bool passed = cutlass::reference::device::BlockCompareEqual(block_ref_D.get(), block_D.get(), block_D.size());
+
+    return passed;
+  }
+
+  /// Initialize operands to be used in the GEMM and reference GEMM
+  void initialize(const ProblemShapeType& problem_size) {
+    auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
+    auto [M, N, K, L] = problem_shape_MNKL;
+
+    stride_A = make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L));
+    stride_B = make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L));
+    stride_C = make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
+    stride_D = make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
+
+    block_A.reset(M * K * L);
+    block_B.reset(K * N * L);
+    block_C.reset(M * N * L);
+    block_D.reset(M * N * L);
+    block_ref_D.reset(M * N * L);
+
+    initialize_block(block_A, seed + 2023);
+    initialize_block(block_B, seed + 2022);
+    initialize_block(block_C, seed + 2021);
+  }
+
+  bool run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) {
+    ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l};
+
+    initialize(problem_size);
+
+    typename Gemm::GemmKernel::Arguments arguments{
+      cutlass::gemm::GemmUniversalMode::kGemm,
+      problem_size,
+      block_A.get(),
+      stride_A,
+      block_B.get(),
+      stride_B,
+      {block_C.get(), stride_C, block_D.get(), stride_D, {options.alpha, options.beta}},
+      hw_info
+    };
+
+    Gemm gemm_op;
+
+    size_t workspace_size = Gemm::get_workspace_size(arguments);
+    cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+    cutlass::Status status = gemm_op.can_implement(arguments);
+    if (status != cutlass::Status::kSuccess) {
+      std::cerr << "This kernel is not supported. Last CUDA error is: "
+                << cudaGetErrorString(cudaGetLastError()) << std::endl;
+      return false;
+    }
+
+    status = gemm_op.initialize(arguments, workspace.get());
+    if (status != cutlass::Status::kSuccess) {
+      std::cerr << "Failed to initialize the CUTLASS kernel. Last CUDA error is: "
+                << cudaGetErrorString(cudaGetLastError()) << std::endl;
+      return false;
+    }
+
+    // Run the GEMM
+    status = gemm_op.run();
+    if (status != cutlass::Status::kSuccess) {
+      std::cerr << "Failed to launch the CUTLASS kernel. Last CUDA error is: "
+                << cudaGetErrorString(cudaGetLastError()) << std::endl;
+      return false;
+    }
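+
+    // Note: run() only launches the kernel; completion (and any asynchronous error) is
+    // observed by the cudaDeviceSynchronize() below, which must happen before the
+    // host-side verification reads the results.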
+
+    cudaError_t result = cudaDeviceSynchronize();
+    if (result != cudaSuccess) {
+      std::cerr << "Error running the CUTLASS kernel. Last CUDA error is: "
+                << cudaGetErrorString(result) << std::endl;
+      return false;
+    }
+
+    // Verify that the result is correct
+    bool passed = verify(problem_size, options.alpha, options.beta);
+    if (!passed) {
+      std::cerr << "Reference check failed" << std::endl;
+    }
+
+    return passed;
+  }
+
+};
+
+#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char const **args) {
+
+  cudaDeviceProp props;
+
+  cudaError_t error = cudaGetDeviceProperties(&props, 0);
+  if (error != cudaSuccess) {
+    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
+    return -1;
+  }
+
+  if (__CUDACC_VER_MAJOR__ < 12 || props.major < 9) {
+    std::cout
+      << "This example requires a GPU of NVIDIA's Hopper Architecture or "
+      << "later (compute capability 90 or greater) and CUDA 12.0 or greater.\n";
+    return 0;
+  }
+
+  //
+  // Parse options
+  //
+
+  Options options;
+
+  options.parse(argc, args);
+
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  if (options.error) {
+    std::cerr << "Aborting execution." << std::endl;
+    return -1;
+  }
+
+#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
+
+  //
+  // Run examples
+  //
+
+  // The KernelHardwareInfo struct holds the number of SMs on the GPU with a given device ID. This
+  // information is used by the underlying kernel.
+  cutlass::KernelHardwareInfo hw_info;
+
+  // Change device_id to another value if you are running on a machine with multiple GPUs and wish
+  // to use a GPU other than that with device ID 0.
+  hw_info.device_id = 0;
+  hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
+
+  bool passed;
+
+  // Problem configuration
+  using ElementA = int8_t;
+  using ElementB = int8_t;
+  using ElementAcc = int32_t;
+  using ElementOutput = int8_t;
+
+  // Note : Only TN WGMMA Gemm is supported currently in 3.0
+  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using LayoutD = cutlass::layout::ColumnMajor;
+
+  // Tiling configuration selection
+  using TileShape = Shape<_128,_64,_128>;
+
+  // Choosing a thread block cluster larger than 1 allows us to multicast data across thread blocks
+  using ClusterShape = Shape<_1,_2,_1>;
+
+  //
+  // Assembling the CollectiveMainloop type
+  //
+
+  // Pipeline depth to be used, i.e. the number of A, B buffers in shared memory
+  constexpr int PipelineStages = 8;
+
+  // Let's choose a warp-specialized mainloop implementation which uses TMA
+  // Note : This requires / assumes the tensors to be 16B aligned
+  using DispatchPolicy = cutlass::gemm::MainloopSm90TmaGmmaWarpSpecialized<PipelineStages, ClusterShape>;
+
+  // TN => K Major for both A & B
+  static constexpr cute::GMMA::Major GmmaMajorA = cute::GMMA::Major::K;
+  static constexpr cute::GMMA::Major GmmaMajorB = cute::GMMA::Major::K;
+
+  // We use the SS op selector as both A, B operands are read directly from SMEM (for TN WGMMA)
+  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector<
+    ElementA, ElementB, ElementAcc, TileShape, GmmaMajorA, GmmaMajorB>()));
+
+  // A loads can be optimized with multicast if cluster-n > 1
+  using GmemTiledCopyA = std::conditional< cute::size(shape<1>(ClusterShape{})) == 1,
+                           cute::SM90_TMA_LOAD,
+                           cute::SM90_TMA_LOAD_MULTICAST>::type;
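+
+  // With ClusterShape = Shape<_1,_2,_1> above, cluster-n == 2, so this conditional resolves
+  // GmemTiledCopyA to cute::SM90_TMA_LOAD_MULTICAST, while cluster-m == 1 leaves GmemTiledCopyB
+  // below on plain cute::SM90_TMA_LOAD. A compile-time spot-check of that claim could read:
+  //   static_assert(std::is_same_v<GmemTiledCopyA, cute::SM90_TMA_LOAD_MULTICAST>);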
+  // B loads can be optimized with multicast if cluster-m > 1
+  using GmemTiledCopyB = std::conditional< cute::size(shape<0>(ClusterShape{})) == 1,
+                           cute::SM90_TMA_LOAD,
+                           cute::SM90_TMA_LOAD_MULTICAST>::type;
+
+  using SmemLayoutAtomA = decltype(cute::GMMA::smem_selector<
+      GmmaMajorA, ElementA, decltype(cute::get<0>(TileShape{})), decltype(cute::get<2>(TileShape{}))
+    >());
+
+  using SmemLayoutAtomB = decltype(cute::GMMA::smem_selector<
+      GmmaMajorB, ElementB, decltype(cute::get<1>(TileShape{})), decltype(cute::get<2>(TileShape{}))
+    >());
+
+  using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma<
+    DispatchPolicy,
+    TileShape,
+    ElementA,
+    cutlass::gemm::TagToStrideA_t<LayoutA>,
+    ElementB,
+    cutlass::gemm::TagToStrideB_t<LayoutB>,
+    TiledMma,
+    GmemTiledCopyA,
+    SmemLayoutAtomA,
+    void, // Does not need a SmemCopyAtom, since A is read directly from SMEM
+    cute::identity,
+    GmemTiledCopyB,
+    SmemLayoutAtomB,
+    void, // Does not need a SmemCopyAtom, since B is read directly from SMEM
+    cute::identity
+  >;
+
+  //
+  // Assembling the Collective Epilogue Type
+  //
+
+  // Break the 128 along TILE_M into chunks of 32, to get a 128B leading dimension
+  using PreSwizzleLayout = Layout< Shape< Shape <_32,_4 >,_64>,
+                                   Stride<Stride< _1,_2048>,_32>>;
+
+  // 128 threads loading 16 elements each (to get vectorized global stores)
+  using TileShapeS2R = Shape<_128,_16>;
+
+  // Layout to ensure bank-conflict free loads & stores
+  using SmemLayout = ComposedLayout<
+                       Swizzle<3,4,3>,
+                       smem_ptr_flag_bits<sizeof_bits<ElementAcc>::value>,
+                       PreSwizzleLayout>;
+
+  // Tiled copy from Smem to Registers
+  // Note : CuTe will vectorize this copy if the tiling + swizzling above were right
+  using TiledCopyS2R = TiledCopy<
+                         Copy_Atom<DefaultCopy, ElementAcc>,
+                         Layout< Shape<_128,_16>,
+                                 Stride<_16,_1>>,
+                         TileShapeS2R>;
+
+  using Epilogue = cutlass::epilogue::collective::Epilogue<
+                     cutlass::gemm::TagToStrideC_t<LayoutC>,
+                     cutlass::gemm::TagToStrideC_t<LayoutD>,
+                     cutlass::epilogue::thread::LinearCombination<ElementOutput, 1, ElementAcc, ElementAcc>,
+                     SmemLayout,
+                     Copy_Atom<DefaultCopy, ElementAcc>,
+                     TiledCopyS2R,
+                     Copy_Atom<DefaultCopy, ElementOutput>>;
+
+  //
+  // Assembling the GemmKernel
+  //
+
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+      Shape<int,int,int,int>,
+      CollectiveMainloop,
+      Epilogue
+  >;
+
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+
+  ExampleRunner<Gemm> runner;
+
+  passed = runner.run(options, hw_info);
+
+  std::cout << "WGMMA GEMM with Epilogue Swizzle : " << (passed ? "Passed" : "Failed") << std::endl;
+
+#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
+
+  return 0;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/50_hopper_gemm_with_epilogue_swizzle/CMakeLists.txt b/examples/50_hopper_gemm_with_epilogue_swizzle/CMakeLists.txt
new file mode 100644
index 0000000000..b213d3936f
--- /dev/null
+++ b/examples/50_hopper_gemm_with_epilogue_swizzle/CMakeLists.txt
@@ -0,0 +1,35 @@
+
+# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3.
Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + +cutlass_example_add_executable( + 50_hopper_gemm_with_epilogue_swizzle + 50_hopper_gemm_with_epilogue_swizzle.cu + ) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index fac98b8ebc..a063bd81dd 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -54,12 +54,14 @@ function(cutlass_example_add_executable NAME) CUTLASS cutlass_tools_util_includes $<$:nvidia::cublas> + cuda ) target_include_directories( ${NAME} PRIVATE ${CUTLASS_EXAMPLES_COMMON_SOURCE_DIR} + ${CUTLASS_EXAMPLES_UTILS_DIR} ) install( @@ -118,6 +120,7 @@ foreach(EXAMPLE 36_gather_scatter_fusion 37_gemm_layernorm_gemm_fusion 38_syr2k_grouped + cute 39_gemm_permute 41_fused_multi_head_attention 42_ampere_tensorop_group_conv @@ -125,6 +128,9 @@ foreach(EXAMPLE 45_dual_gemm 46_depthwise_simt_conv2dfprop 47_ampere_gemm_universal_streamk + 48_hopper_warp_specialized_gemm + 49_hopper_gemm_schedules_with_collective_builder + 50_hopper_gemm_with_epilogue_swizzle ) add_subdirectory(${EXAMPLE}) diff --git a/examples/cute/CMakeLists.txt b/examples/cute/CMakeLists.txt new file mode 100644 index 0000000000..c210d634af --- /dev/null +++ b/examples/cute/CMakeLists.txt @@ -0,0 +1,30 @@ + +# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +add_subdirectory(tutorial) diff --git a/examples/cute/tutorial/CMakeLists.txt b/examples/cute/tutorial/CMakeLists.txt new file mode 100644 index 0000000000..97867ded44 --- /dev/null +++ b/examples/cute/tutorial/CMakeLists.txt @@ -0,0 +1,34 @@ + +# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cutlass_example_add_executable( + sgemm_nt_1 + sgemm_nt_1.cu +) + diff --git a/examples/cute/tutorial/sgemm_nt_1.cu b/examples/cute/tutorial/sgemm_nt_1.cu new file mode 100644 index 0000000000..fc4839a5bf --- /dev/null +++ b/examples/cute/tutorial/sgemm_nt_1.cu @@ -0,0 +1,426 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#include <cstdlib>
+#include <cstdio>
+
+#include <cute/tensor.hpp>
+
+#include "cutlass/util/print_error.hpp"
+#include "cutlass/util/GPU_Clock.hpp"
+#if defined(CUTLASS_ENABLE_CUBLAS) && CUTLASS_ENABLE_CUBLAS != 0
+#  include "cutlass/util/cublas_wrappers.hpp"
+#endif
+#include "cutlass/util/helper_cuda.hpp"
+
+template <class MShape, class NShape, class KShape,
+          class TA, class AStride, class ABlockLayout, class AThreadLayout,
+          class TB, class BStride, class BBlockLayout, class BThreadLayout,
+          class TC, class CStride, class CBlockLayout, class CThreadLayout,
+          class Alpha, class Beta>
+__global__ static
+__launch_bounds__(decltype(size(CThreadLayout{}))::value)
+void
+gemm_device(MShape M, NShape N, KShape K,
+            TA const* A, AStride dA, ABlockLayout blockA, AThreadLayout tA,
+            TB const* B, BStride dB, BBlockLayout blockB, BThreadLayout tB,
+            TC      * C, CStride dC, CBlockLayout       , CThreadLayout tC,
+            Alpha alpha, Beta beta)
+{
+  using namespace cute;
+  using X = Underscore;
+
+  // Preconditions
+  CUTE_STATIC_ASSERT(is_static<ABlockLayout>::value);
+  CUTE_STATIC_ASSERT(is_static<BBlockLayout>::value);
+  CUTE_STATIC_ASSERT(is_static<CBlockLayout>::value);
+
+  CUTE_STATIC_ASSERT(is_static<AThreadLayout>::value);
+  CUTE_STATIC_ASSERT(is_static<BThreadLayout>::value);
+  CUTE_STATIC_ASSERT(is_static<CThreadLayout>::value);
+
+  CUTE_STATIC_ASSERT_V(size(tA) == size(tC));
+  CUTE_STATIC_ASSERT_V(size(tB) == size(tC));
+
+  //CUTE_STATIC_ASSERT_V(shape<0>(blockA) == shape<0>(blockC));      // BLK_M
+  //CUTE_STATIC_ASSERT_V(shape<0>(blockB) == shape<1>(blockC));      // BLK_N
+  CUTE_STATIC_ASSERT_V(shape<1>(blockA) == shape<1>(blockB));        // BLK_K
+
+  // Shared memory buffers
+  __shared__ TA smemA[cosize_v<ABlockLayout>];
+  __shared__ TB smemB[cosize_v<BBlockLayout>];
+  auto sA = make_tensor(make_smem_ptr(smemA), blockA);               // (BLK_M,BLK_K)
+  auto sB = make_tensor(make_smem_ptr(smemB), blockB);               // (BLK_N,BLK_K)
+
+  // Represent the full tensors
+  auto mA = make_tensor(make_gmem_ptr(A), make_shape(M,K), dA);      // (M,K)
+  auto mB = make_tensor(make_gmem_ptr(B), make_shape(N,K), dB);      // (N,K)
+  auto mC = make_tensor(make_gmem_ptr(C), make_shape(M,N), dC);      // (M,N)
+
+  // Get the appropriate blocks for this thread block --
+  // potential for thread block locality
+  auto blk_shape = make_shape(size<0>(sA), size<0>(sB), size<1>(sB));// (BLK_M,BLK_N,BLK_K)
+  auto blk_coord = make_coord(blockIdx.x, blockIdx.y, _);            // (m,n,k)
+
+  auto gA = local_tile(mA, blk_shape, blk_coord, Step<_1, X,_1>{});  // (BLK_M,BLK_K,k)
+  auto gB = local_tile(mB, blk_shape, blk_coord, Step< X,_1,_1>{});  // (BLK_N,BLK_K,k)
+  auto gC = local_tile(mC, blk_shape, blk_coord, Step<_1,_1, X>{});  // (BLK_M,BLK_N)
+
+  //
+  // Partition the copying of A and B tiles across the threads
+  //
+
+  // TUTORIAL: Example of simple partitioning of A|B tiles over tA|tB
+  //           Default is a raked partition,
but can be changed with Step parameter + + auto tAgA = local_partition(gA, tA, threadIdx.x); // (THR_M,THR_K,k) + auto tAsA = local_partition(sA, tA, threadIdx.x); // (THR_M,THR_K) + + auto tBgB = local_partition(gB, tB, threadIdx.x); // (THR_N,THR_K,k) + auto tBsB = local_partition(sB, tB, threadIdx.x); // (THR_N,THR_K) + + // + // Define C accumulators and A/B partitioning + // + + // TUTORIAL: Example of partitioning via projections of tC + + // Partition sA (M,K) by the rows of tC + auto tCsA = local_partition(sA, tC, threadIdx.x, Step<_1, X>{}); // (THR_M,BLK_K) + // Partition sB (N,K) by the cols of tC + auto tCsB = local_partition(sB, tC, threadIdx.x, Step< X,_1>{}); // (THR_N,BLK_K) + // Partition gC (M,N) by the tile of tC + auto tCgC = local_partition(gC, tC, threadIdx.x, Step<_1,_1>{}); // (THR_M,THR_N) + + // Allocate the accumulators -- same size as the projected data + auto tCrC = make_fragment_like(tCgC); // (THR_M,THR_N) + + // Clear the accumulators + clear(tCrC); + +#if 0 + if(thread0()) { + print("mA\n"); + print(mA.shape()); print("\n"); print(mA.stride()); + print("\n\ngA\n"); + print(gA.shape()); print("\n"); print(gA.stride()); + print("\n\ntAgA\n"); + print(tAgA.shape()); print("\n"); print(tAgA.stride()); + print("\n\nsA\n"); + print(sA.shape()); print("\n"); print(sA.stride()); + print("\n\ntAsA\n"); + print(tAsA.shape()); print("\n"); print(tAsA.stride()); + print("\n\n"); + } +#endif + +#if 0 + if(thread0()) { + print("mB\n"); + print(mB.shape()); print("\n"); print(mB.stride()); + print("\n\ngB\n"); + print(gB.shape()); print("\n"); print(gB.stride()); + print("\n\ntBgB\n"); + print(tBgB.shape()); print("\n"); print(tBgB.stride()); + print("\n\nsB\n"); + print(sB.shape()); print("\n"); print(sB.stride()); + print("\n\ntBsB\n"); + print(tBsB.shape()); print("\n"); print(tBsB.stride()); + print("\n\n"); + } +#endif + +#if 0 + if(thread0()) { + print("mC\n"); + print(mC.shape()); print("\n"); print(mC.stride()); + print("\n\ngC\n"); + print(gC.shape()); print("\n"); print(gC.stride()); + print("\n\ntCsA\n"); + print(tCsA.shape()); print("\n"); print(tCsA.stride()); + print("\n\ntCsB\n"); + print(tCsB.shape()); print("\n"); print(tCsB.stride()); + print("\n\ntCgC\n"); + print(tCgC.shape()); print("\n"); print(tCgC.stride()); + print("\n\ntCrC\n"); + print(tCrC.shape()); print("\n"); print(tCrC.stride()); + print("\n\n"); + } +#endif + +#if 1 + + // TUTORIAL: Example of a very simple compute loop + // Data is read from global to shared memory via the tA|tB partitioning + // gemm(.) operates on the shared memory directly via the tC partitioning + + auto k_max = size<2>(tAgA); + + for (int k = 0; k < k_max; ++k) + { + // Copy gmem to smem + copy(tAgA(_,_,k), tAsA); + copy(tBgB(_,_,k), tBsB); + + // In case copy uses cp.async, make sure that the cp.async + // instructions are ordered with respect to other cp.async + // instructions (fence), then wait on all the outstanding copy + // operations (wait<0>()). __syncthreads() alone does not do + // this. + // + // NOTE: cp_async_wait<0>() currently issues cp.async.wait_all. + // This is equivalent to cp.async.commit_group followed by + // cp.async_wait_group 0. This should make the first + // cp_async_fence() (which also issues cp.async.commit_group) + // redundant. The tutorial works as-is, so we'll leave the + // redundant fence in for now and study its removal later. 
+      cp_async_fence();
+      cp_async_wait<0>();
+
+      __syncthreads();
+
+      // Compute gemm on smem
+      gemm(tCsA, tCsB, tCrC);
+
+      __syncthreads();
+    }
+
+#endif
+
+  //
+  // Epilogue
+  //
+
+  axpby(alpha, tCrC, beta, tCgC);
+}
+
+
+template <class TA, class TB, class TC,
+          class Alpha, class Beta>
+void
+gemm(int m, int n, int k,
+     Alpha alpha,
+     TA const* A, int ldA,
+     TB const* B, int ldB,
+     Beta beta,
+     TC      * C, int ldC,
+     cudaStream_t stream = 0)
+{
+  using namespace cute;
+
+  // Define shapes (dynamic)
+  auto M = int(m);
+  auto N = int(n);
+  auto K = int(k);
+
+  // Define strides (mixed)
+  auto dA = make_stride(Int<1>{}, ldA);
+  auto dB = make_stride(Int<1>{}, ldB);
+  auto dC = make_stride(Int<1>{}, ldC);
+
+  // Define block sizes (static)
+  auto bM = Int<128>{};
+  auto bN = Int<128>{};
+  auto bK = Int<  8>{};
+
+  // Define the block layouts (static)
+  auto sA = make_layout(make_shape(bM,bK));
+  auto sB = make_layout(make_shape(bN,bK));
+  auto sC = make_layout(make_shape(bM,bN));
+
+  // Define the thread layouts (static)
+  auto tA = make_layout(make_shape(Int<32>{}, Int< 8>{}));
+  auto tB = make_layout(make_shape(Int<32>{}, Int< 8>{}));
+  auto tC = make_layout(make_shape(Int<16>{}, Int<16>{}));
+
+  dim3 dimBlock(size(tC));
+  dim3 dimGrid(ceil_div(size(M), size(bM)),
+               ceil_div(size(N), size(bN)));
+  gemm_device
+      <<< dimGrid, dimBlock, 0, stream >>>
+      (M,  N,  K,
+       A, dA, sA, tA,
+       B, dB, sB, tB,
+       C, dC, sC, tC,
+       alpha, beta);
+}
+
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <iostream>
+
+void test_gemm(int m, int n, int k)
+{
+  cute::device_init(0);
+
+  std::cout << "M = " << m << std::endl;
+  std::cout << "N = " << n << std::endl;
+  std::cout << "K = " << k << std::endl;
+
+  using TA = float;
+  using TB = float;
+  using TC = float;
+  using TI = float;
+
+  thrust::host_vector<TA> h_A(m*k);
+  thrust::host_vector<TB> h_B(n*k);
+  thrust::host_vector<TC> h_C(m*n);
+
+  for (int j = 0; j < m*k; ++j) h_A[j] = static_cast<TA>( 2*(rand() / double(RAND_MAX)) - 1 );
+  for (int j = 0; j < n*k; ++j) h_B[j] = static_cast<TB>( 2*(rand() / double(RAND_MAX)) - 1 );
+  for (int j = 0; j < m*n; ++j) h_C[j] = static_cast<TC>(-1);
+
+  thrust::device_vector<TA> d_A = h_A;
+  thrust::device_vector<TB> d_B = h_B;
+  thrust::device_vector<TC> d_C = h_C;
+
+  TI alpha = 1.0;
+  TI beta  = 0.0;
+
+  double gflops = (2.0*m*n*k) * 1e-9;
+
+  const int timing_iterations = 100;
+  GPU_Clock timer;
+
+#if defined(CUTLASS_ENABLE_CUBLAS) && CUTLASS_ENABLE_CUBLAS != 0
+  //
+  // cuBLAS
+  //
+
+  cublasHandle_t handle;
+  cublasCreate(&handle);
+
+  // Run once
+  d_C = h_C;
+  blam::cublas::gemm(handle, CUBLAS_OP_N, CUBLAS_OP_T,
+                     m, n, k,
+                     &alpha,
+                     d_A.data().get(), m,
+                     d_B.data().get(), n,
+                     &beta,
+                     d_C.data().get(), m);
+  CUTE_CHECK_LAST();
+
+  thrust::host_vector<TC> cublas_result = d_C;
+
+  // Timing iterations
+  timer.start();
+  for (int i = 0; i < timing_iterations; ++i) {
+    blam::cublas::gemm(handle, CUBLAS_OP_N, CUBLAS_OP_T,
+                       m, n, k,
+                       &alpha,
+                       d_A.data().get(), m,
+                       d_B.data().get(), n,
+                       &beta,
+                       d_C.data().get(), m);
+  }
+  double cublas_time = timer.seconds() / timing_iterations;
+  CUTE_CHECK_LAST();
+  printf("CUBLAS_GEMM:   [%6.1f]GFlop/s  (%6.4f)ms\n", gflops / cublas_time, cublas_time*1000);
+
+#else
+
+  std::cout << "Verification by comparison with cuBLAS is disabled, "
+    "either because the CMake option CUTLASS_ENABLE_CUBLAS "
+    "was explicitly set to OFF, or because CMake could not find cuBLAS. "
+    "If you would like to enable verification with cuBLAS, "
+    "please set the CMake option CUTLASS_ENABLE_CUBLAS to ON, "
+    "rerun CMake, and recompile this example.\n";
+
+#endif // CUTLASS_ENABLE_CUBLAS
+
+  //
+  // CuTe
+  //
+
+  // Run once (and check)
+  d_C = h_C;
+  gemm(m, n, k,
+       alpha,
+       d_A.data().get(), m,
+       d_B.data().get(), n,
+       beta,
+       d_C.data().get(), m);
+  CUTE_CHECK_LAST();
+  thrust::host_vector<TC> cute_result = d_C;
+
+  // Timing iterations
+  timer.start();
+  for (int i = 0; i < timing_iterations; ++i) {
+    gemm(m, n, k,
+         alpha,
+         d_A.data().get(), m,
+         d_B.data().get(), n,
+         beta,
+         d_C.data().get(), m);
+  }
+  double cute_time = timer.seconds() / timing_iterations;
+  CUTE_CHECK_LAST();
+  printf("CUTE_GEMM:     [%6.1f]GFlop/s  (%6.4f)ms\n", gflops / cute_time, cute_time*1000);
+
+#if defined(CUTLASS_ENABLE_CUBLAS) && CUTLASS_ENABLE_CUBLAS != 0
+  printf("Empirical Perf: %.1f%%\n", (cublas_time / cute_time) * 100);
+
+  auto host_matrix_to_const_column_major_cute_tensor =
+    [](const auto& X, int num_rows, int num_cols, int LDX) {
+      const auto shape = cute::Shape<int, int>{num_rows, num_cols};
+      const auto strides = cute::Stride<int, int>{1, LDX};
+      return cute::make_tensor(X.data(), cute::make_layout(shape, strides));
+    };
+
+  const auto A_view = host_matrix_to_const_column_major_cute_tensor(h_A, m, k, m);
+  // B^T is k x n, so B is n x k.
+  const auto B_view = host_matrix_to_const_column_major_cute_tensor(h_B, n, k, n);
+  const auto C_computed_view = host_matrix_to_const_column_major_cute_tensor(cute_result, m, n, m);
+  const auto C_expected_view = host_matrix_to_const_column_major_cute_tensor(cublas_result, m, n, m);
+  print_matrix_multiply_mollified_relative_error("float", A_view, B_view, C_computed_view, C_expected_view);
+
+#endif // CUTLASS_ENABLE_CUBLAS
+}
+
+
+int main(int argc, char** argv)
+{
+  int m = 5120;
+  if (argc >= 2)
+    sscanf(argv[1], "%d", &m);
+
+  int n = 5120;
+  if (argc >= 3)
+    sscanf(argv[2], "%d", &n);
+
+  int k = 4096;
+  if (argc >= 4)
+    sscanf(argv[3], "%d", &k);
+
+  test_gemm(m, n, k);
+
+  return 0;
+}
diff --git a/include/cute/algorithm/axpby.hpp b/include/cute/algorithm/axpby.hpp
new file mode 100644
index 0000000000..a613417d39
--- /dev/null
+++ b/include/cute/algorithm/axpby.hpp
@@ -0,0 +1,79 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/tensor.hpp>
+
+namespace cute
+{
+
+//
+// Accept mutable temporaries
+//
+template <class Alpha,
+          class XEngine, class XLayout,
+          class Beta,
+          class YEngine, class YLayout>
+CUTE_HOST_DEVICE
+void
+axpby(Alpha const& alpha,
+      Tensor<XEngine, XLayout> const& x,
+      Beta  const& beta,
+      Tensor<YEngine, YLayout>     && y)
+{
+  return axpby(alpha, x, beta, y);
+}
+
+//
+// AXPBY
+//
+template <class Alpha,
+          class XEngine, class XLayout,
+          class Beta,
+          class YEngine, class YLayout>
+CUTE_HOST_DEVICE
+void
+axpby(Alpha const& alpha,
+      Tensor<XEngine, XLayout> const& x,
+      Beta  const& beta,
+      Tensor<YEngine, YLayout>      & y)
+{
+  auto isBetaZero = (beta == Int<0>{});
+
+  CUTE_UNROLL
+  for (int i = 0; i < size(x); ++i) {
+    y(i) = (isBetaZero ? alpha * x(i) : alpha * x(i) + beta * y(i));
+  }
+}
+
+} // end namespace cute
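A quick usage sketch for axpby (hedged; it mirrors the epilogue of the CuTe tutorial above, which
calls axpby(alpha, tCrC, beta, tCgC) on a register fragment and a global-memory partition):

    // computes y(i) = alpha * x(i) + beta * y(i) elementwise over the flattened tensors
    cute::axpby(alpha, x, beta, y);

When beta compares equal to Int<0>{}, the isBetaZero branch lets the compiler drop the read of y,
so the epilogue reduces to a pure scaled store.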
diff --git a/include/cute/algorithm/clear.hpp b/include/cute/algorithm/clear.hpp
new file mode 100644
index 0000000000..ce7b51095d
--- /dev/null
+++ b/include/cute/algorithm/clear.hpp
@@ -0,0 +1,66 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/tensor.hpp>
+
+#include <cute/algorithm/fill.hpp>
+
+namespace cute
+{
+
+//
+// Accept mutable temporaries
+//
+template <class Engine, class Layout>
+CUTE_HOST_DEVICE
+void
+clear(Tensor<Engine, Layout>&& tensor)
+{
+  return clear(tensor);
+}
+
+//
+// Set elements to zero
+//
+template <class Engine, class Layout>
+CUTE_HOST_DEVICE
+void
+clear(Tensor<Engine, Layout>& tensor)
+{
+  using T = typename Tensor<Engine, Layout>::value_type;
+
+  fill(tensor, T{});
+}
+
+} // end namespace cute
diff --git a/include/cute/algorithm/copy.hpp b/include/cute/algorithm/copy.hpp
new file mode 100644
index 0000000000..04ceb051a4
--- /dev/null
+++ b/include/cute/algorithm/copy.hpp
@@ -0,0 +1,262 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/tensor.hpp>
+#include <cute/tensor_predicate.hpp>
+
+#include <cute/atom/copy_atom.hpp>
+
+namespace cute
+{
+
+//
+// Accept mutable temporaries
+//
+
+template <class PrdTensor,
+          class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy_if(PrdTensor const& pred,
+        Tensor<SrcEngine, SrcLayout> const& src,
+        Tensor<DstEngine, DstLayout>     && dst)
+{
+  return copy_if(pred, src, dst);
+}
+
+template <class... CopyArgs,
+          class PrdTensor,
+          class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy_if(Copy_Atom<CopyArgs...> const& copy_atom,
+        PrdTensor const& pred,
+        Tensor<SrcEngine, SrcLayout> const& src,
+        Tensor<DstEngine, DstLayout>     && dst)
+{
+  return copy_if(copy_atom, pred, src, dst);
+}
+
+template <class VecType,
+          class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy_vec(Tensor<SrcEngine, SrcLayout> const& src,
+         Tensor<DstEngine, DstLayout>     && dst)
+{
+  return copy_vec<VecType>(src, dst);
+}
+
+template <class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy(Tensor<SrcEngine, SrcLayout> const& src,
+     Tensor<DstEngine, DstLayout>     && dst)
+{
+  return copy(src, dst);
+}
+
+template <class... CopyArgs,
+          class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy(Copy_Atom<CopyArgs...> const& copy_atom,
+     Tensor<SrcEngine, SrcLayout> const& src,
+     Tensor<DstEngine, DstLayout>     && dst)
+{
+  return copy(copy_atom, src, dst);
+}
+
+//
+// copy_if -- Predicated Copy
+//
+
+template <class PrdTensor,
+          class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy_if(PrdTensor const& pred,
+        Tensor<SrcEngine, SrcLayout> const& src,
+        Tensor<DstEngine, DstLayout>      & dst)
+{
+  auto copy_op = select_elementwise_copy(src, dst);
+
+  CUTE_UNROLL
+  for (int i = 0; i < size(src); ++i) {
+    if (pred(i)) {
+      copy_op.copy(src(i), dst(i));
+    }
+  }
+}
+
+//
+// copy_if -- Predicated CopyAtom
+//
+
+template <class... CopyArgs,
+          class PredTensor,
+          class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy_if(Copy_Atom<CopyArgs...> const& copy_atom,
+        PredTensor const& pred,                  // (Rest...)
+        Tensor<SrcEngine, SrcLayout> const& src, // (V,Rest...)
+        Tensor<DstEngine, DstLayout>      & dst) // (V,Rest...)
+{
+  static_assert(SrcLayout::rank == DstLayout::rank, "CopyAtom rank-mismatch.");
+  if constexpr (SrcLayout::rank == 1) {   // Dispatch the copy
+    copy_atom.call(src, dst);
+  } else {                                // Loop over all but the first mode
+    constexpr int R = SrcLayout::rank;
+    auto src_v = group_modes<1,R>(src);
+    auto dst_v = group_modes<1,R>(dst);
+    CUTE_UNROLL
+    for (int i = 0; i < size<1>(src_v); ++i) {
+      if (pred(i)) {
+        copy_atom.call(src_v(_,i), dst_v(_,i));
+      }
+    }
+  }
+}
+
+//
+// copy_vec -- attempt vectorized copy with VecType
+//
+
+template <class VecType,
+          class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy_vec(Tensor<SrcEngine, SrcLayout> const& src,
+         Tensor<DstEngine, DstLayout>      & dst)
+{
+  using SrcType = typename SrcEngine::value_type;
+  using DstType = typename DstEngine::value_type;
+  if constexpr (sizeof(SrcType) == sizeof(DstType) && sizeof(VecType) > sizeof(DstType))
+  {
+    /* @pre  is_aligned(src.data()) &&
+     *       is_aligned(dst.data())
+     */
+    auto src_v = recast<VecType const>(src);
+    auto dst_v = recast<VecType      >(dst);
+
+#if 0
+    if (thread0()) {
+      print("copy_vec -- vectorizing copy from %3db to %3db\n", int(8*sizeof(SrcType)), int(8*sizeof(VecType)));
+      print("   "); print(layout(src)); print(" => "); print(layout(src_v)); print("\n");
+      print("   "); print(layout(dst)); print(" => "); print(layout(dst_v)); print("\n");
+    }
+#endif
+
+    return copy_if(TrivialPredTensor{}, src_v, dst_v);
+  } else {
+#if 0
+    if (thread0()) {
+      print("copy_vec -- not vectorizing, copy with %3db and %3db\n", int(8*sizeof(SrcType)), int(8*sizeof(DstType)));
+      print("   "); print(layout(src)); print("\n");
+      print("   "); print(layout(dst)); print("\n");
+    }
+#endif
+
+    return copy_if(TrivialPredTensor{}, src, dst);
+  }
+}
+
+//
+// copy -- auto-vectorizing copy
+//
+
+template <class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy(Tensor<SrcEngine, SrcLayout> const& src,
+     Tensor<DstEngine, DstLayout>      & dst)
+{
+  constexpr int N = decltype(max_common_vector(src, dst))::value;
+
+#if 0
+  if (thread0()) {
+    print("copy -- found a max_common_vector of %d\n", N);
+    print("   "); print(src.data()); print(" o "); print(layout(src)); print("\n");
+    print("   "); print(dst.data()); print(" o "); print(layout(dst)); print("\n");
+  }
+#endif
+
+  if constexpr (N <= 1) {
+    return copy_if(TrivialPredTensor{}, src, dst);
+  } else {
+    constexpr int vec_bits = N * sizeof_bits<typename SrcEngine::value_type>::value;
+    using VecType = uint_bit_t<vec_bits>;
+    return copy_vec<VecType>(src, dst);
+  }
+}
+
+//
+// copy -- CopyAtom
+//
+
+template <class... CopyArgs,
+          class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy(Copy_Atom<CopyArgs...> const& copy_atom,
+     Tensor<SrcEngine, SrcLayout> const& src,
+     Tensor<DstEngine, DstLayout>      & dst)
+{
+  return copy_if(copy_atom, TrivialPredTensor{}, src, dst);
+}
+
+template <class T,
+          class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy(Copy_Atom<DefaultCopy, T> const&,
+     Tensor<SrcEngine, SrcLayout> const& src,
+     Tensor<DstEngine, DstLayout>      & dst)
+{
+  return copy(src, dst);
+}
+
+} // end namespace cute
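A worked example of the auto-vectorizing copy above (a sketch, not part of the patch): if
max_common_vector(src, dst) returns 4 for a pair of float tensors, then vec_bits = 4 * 32 = 128
and VecType = cute::uint128_t, so copy_vec<VecType> moves 16 bytes per element copy; if the
common vector is 1 or less, the dispatch falls back to the scalar predicated copy_if path.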
diff --git a/include/cute/algorithm/fill.hpp b/include/cute/algorithm/fill.hpp
new file mode 100644
index 0000000000..bc0c4ad16d
--- /dev/null
+++ b/include/cute/algorithm/fill.hpp
@@ -0,0 +1,87 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/tensor.hpp>
+#include <cute/algorithm/prefer.hpp>
+
+namespace cute
+{
+
+//
+// Accept mutable temporaries
+//
+template <class Engine, class Layout, class T>
+CUTE_HOST_DEVICE
+void
+fill(Tensor<Engine, Layout>&& tensor, T const& value)
+{
+  return fill(tensor, value);
+}
+
+namespace detail
+{
+
+// Prefer fill(tensor.data(), value), if possible
+template <class Engine, class Layout, class T>
+CUTE_HOST_DEVICE
+auto
+fill(Tensor<Engine, Layout>& tensor, T const& value, prefer<1>)
+    -> decltype(fill(tensor.data(), value))
+{
+  fill(tensor.data(), value);
+}
+
+// Default implementation
+template <class Engine, class Layout, class T>
+CUTE_HOST_DEVICE
+void
+fill(Tensor<Engine, Layout>& tensor, T const& value, prefer<0>)
+{
+  CUTE_UNROLL
+  for (int i = 0; i < size(tensor); ++i) {
+    tensor(i) = value;
+  }
+}
+
+} // end namespace detail
+
+template <class Engine, class Layout, class T>
+CUTE_HOST_DEVICE
+void
+fill(Tensor<Engine, Layout>& tensor, T const& value)
+{
+  return detail::fill(tensor, value, prefer<1>{});
+}
+
+} // end namespace cute
diff --git a/include/cute/algorithm/functional.hpp b/include/cute/algorithm/functional.hpp
new file mode 100644
index 0000000000..e66cd975d5
--- /dev/null
+++ b/include/cute/algorithm/functional.hpp
@@ -0,0 +1,198 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/numeric/math.hpp>
+
+/** C++14 extensions */
+
+namespace cute {
+
+/**************/
+/** Identity **/
+/**************/
+
+struct identity {
+  template <class T>
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto) operator()(T&& arg) const {
+    return std::forward<T>(arg);
+  }
+};
+
+template <class R>
+struct constant_fn {
+  template <class... T>
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto) operator()(T&&...) const {
+    return r_;
+  }
+  R r_;
+};
+
+/***********/
+/** Unary **/
+/***********/
+
+#define CUTE_LEFT_UNARY_OP(NAME,OP)                           \
+  struct NAME {                                               \
+    template <class T>                                        \
+    CUTE_HOST_DEVICE constexpr                                \
+    decltype(auto) operator()(T&& arg) const {                \
+      return OP std::forward<T>(arg);                         \
+    }                                                         \
+  }
+#define CUTE_RIGHT_UNARY_OP(NAME,OP)                          \
+  struct NAME {                                               \
+    template <class T>                                        \
+    CUTE_HOST_DEVICE constexpr                                \
+    decltype(auto) operator()(T&& arg) const {                \
+      return std::forward<T>(arg) OP ;                        \
+    }                                                         \
+  }
+#define CUTE_NAMED_UNARY_OP(NAME,OP)                          \
+  struct NAME {                                               \
+    template <class T>                                        \
+    CUTE_HOST_DEVICE constexpr                                \
+    decltype(auto) operator()(T&& arg) const {                \
+      return OP (std::forward<T>(arg));                       \
+    }                                                         \
+  }
+
+CUTE_LEFT_UNARY_OP(unary_plus,      +);
+CUTE_LEFT_UNARY_OP(negate,          -);
+CUTE_LEFT_UNARY_OP(bit_not,         ~);
+CUTE_LEFT_UNARY_OP(logical_not,     !);
+CUTE_LEFT_UNARY_OP(dereference,     *);
+CUTE_LEFT_UNARY_OP(address_of,      &);
+CUTE_LEFT_UNARY_OP(pre_increment,   ++);
+CUTE_LEFT_UNARY_OP(pre_decrement,   --);
+
+CUTE_RIGHT_UNARY_OP(post_increment, ++);
+CUTE_RIGHT_UNARY_OP(post_decrement, --);
+
+CUTE_NAMED_UNARY_OP(abs_fn,         abs);
+CUTE_NAMED_UNARY_OP(conjugate,      cute::conj);
+
+#undef CUTE_LEFT_UNARY_OP
+#undef CUTE_RIGHT_UNARY_OP
+#undef CUTE_NAMED_UNARY_OP
+
+/************/
+/** Binary **/
+/************/
+
+#define CUTE_BINARY_OP(NAME,OP)                               \
+  struct NAME {                                               \
+    template <class T, class U>                               \
+    CUTE_HOST_DEVICE constexpr                                \
+    decltype(auto) operator()(T&& lhs, U&& rhs) const {       \
+      return std::forward<T>(lhs) OP std::forward<U>(rhs);    \
+    }                                                         \
+  }
+#define CUTE_NAMED_BINARY_OP(NAME,OP)                         \
+  struct NAME {                                               \
+    template <class T, class U>                               \
+    CUTE_HOST_DEVICE constexpr                                \
+    decltype(auto) operator()(T&& lhs, U&& rhs) const {       \
+      return OP (std::forward<T>(lhs), std::forward<U>(rhs)); \
+    }                                                         \
+  }
+
+
+CUTE_BINARY_OP(plus,                 +);
+CUTE_BINARY_OP(minus,                -);
+CUTE_BINARY_OP(multiplies,           *);
+CUTE_BINARY_OP(divides,              /);
+CUTE_BINARY_OP(modulus,              %);
+
+CUTE_BINARY_OP(plus_assign,          +=);
+CUTE_BINARY_OP(minus_assign,         -=);
+CUTE_BINARY_OP(multiplies_assign,    *=);
+CUTE_BINARY_OP(divides_assign,       /=);
+CUTE_BINARY_OP(modulus_assign,       %=);
+
+CUTE_BINARY_OP(bit_and,              &);
+CUTE_BINARY_OP(bit_or,               |);
+CUTE_BINARY_OP(bit_xor,              ^);
+CUTE_BINARY_OP(left_shift,           <<);
+CUTE_BINARY_OP(right_shift,          >>);
+
+CUTE_BINARY_OP(bit_and_assign,       &=);
+CUTE_BINARY_OP(bit_or_assign,        |=);
+CUTE_BINARY_OP(bit_xor_assign,       ^=);
+CUTE_BINARY_OP(left_shift_assign,    <<=);
+CUTE_BINARY_OP(right_shift_assign,   >>=);
+
+CUTE_BINARY_OP(logical_and,          &&);
+CUTE_BINARY_OP(logical_or,           ||);
+
+CUTE_BINARY_OP(equal_to,             ==);
+CUTE_BINARY_OP(not_equal_to,         !=);
+CUTE_BINARY_OP(greater,              >);
+CUTE_BINARY_OP(less,                 <);
+CUTE_BINARY_OP(greater_equal,        >=);
+CUTE_BINARY_OP(less_equal,           <=);
+
+CUTE_NAMED_BINARY_OP(max_fn,         cute::max);
+CUTE_NAMED_BINARY_OP(min_fn,         cute::min);
+
+#undef CUTE_BINARY_OP
+#undef CUTE_NAMED_BINARY_OP
+
+/**********/
+/** Meta **/
+/**********/
+
+template <class Fn, class Arg>
+struct bound_fn {
+
+  template <class T>
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  operator()(T&& arg) {
+    return fn_(arg_, std::forward<T>(arg));
+  }
+
+  Fn fn_;
+  Arg arg_;
+};
+
+template <class Fn, class Arg>
+CUTE_HOST_DEVICE constexpr
+auto
+bind(Fn const& fn, Arg const& arg) {
+  return bound_fn<Fn,Arg>{fn, arg};
+}
+
+} // end namespace cute
diff --git a/include/cute/algorithm/gemm.hpp b/include/cute/algorithm/gemm.hpp
new file mode 100644
index 0000000000..6e2ce612c0
--- /dev/null
+++ b/include/cute/algorithm/gemm.hpp
@@ -0,0 +1,718 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/tensor.hpp>
+#include <cute/atom/mma_atom.hpp>
+#include <cute/algorithm/functional.hpp>
+#include <cute/algorithm/copy.hpp>
+
+/** The gemm algorithm takes four (or three) tensors and computes
+ *   D = A * B + C
+ * It dispatches based on the number of modes each tensor has:
+ *
+ * 1. `(V) x (V) => (V)`.
+ *      The element-wise product of vectors. Dispatches to FMA or MMA.
+ * 2. `(M) x (N) => (M,N)`.
+ *      The outer product of vectors. Dispatches to [3] with new mode K=(1).
+ * 3. `(M,K) x (N,K) => (M,N)`.
+ *      The product of matrices. Dispatches to [5] with MMA vector-mode V.
+ * 4. `(V,M) x (V,N) => (V,M,N)`.
+ *      The batched outer product of vectors. Accounts for register reuse and dispatches to [1] for each (m,n).
+ * 5. `(V,M,K) x (V,N,K) => (V,M,N)`.
+ *      The batched product of matrices. Dispatches to [4] for each (k).
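+ *
+ * For intuition (a sketch, not an additional dispatch case): a call on rank-3 register
+ * fragments, e.g. gemm(mma, D, A, B, C) with shapes (V,M,K)/(V,N,K)/(V,M,N), lands in
+ * case [5], which unrolls over k calling case [4]; case [4] unrolls over (m,n) with a
+ * serpentine traversal for register reuse, calling case [1] once per MMA atom.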
+ */ + +namespace cute +{ + +// +// Three arguments to four +// + +template +CUTE_HOST_DEVICE +void +gemm(Tensor const& A, + Tensor const& B, + Tensor & C) +{ + return gemm(C, A, B, C); +} + +template +CUTE_HOST_DEVICE +void +gemm(MMA_Atom const& mma, + Tensor const& A, + Tensor const& B, + Tensor & C) +{ + return gemm(mma, C, A, B, C); +} + +// +// Accept mutable temporaries +// + +template +CUTE_HOST_DEVICE +void +gemm(Tensor const& A, + Tensor const& B, + Tensor && C) +{ + return gemm(C, A, B, C); +} + +template +CUTE_HOST_DEVICE +void +gemm(Tensor && D, + Tensor const& A, + Tensor const& B, + Tensor const& C) +{ + return gemm(D, A, B, C); +} + +template +CUTE_HOST_DEVICE +void +gemm(MMA_Atom const& mma, + Tensor const& A, + Tensor const& B, + Tensor && C) +{ + return gemm(mma, C, A, B, C); +} + +template +CUTE_HOST_DEVICE +void +gemm(MMA_Atom const& mma, + Tensor && D, + Tensor const& A, + Tensor const& B, + Tensor const& C) +{ + return gemm(mma, D, A, B, C); +} + +// +// Default MMA is UniversalFMA +// + +template +CUTE_HOST_DEVICE +void +gemm(Tensor & D, + Tensor const& A, + Tensor const& B, + Tensor const& C) +{ + using MMA = MMA_Atom::value_type, + typename Tensor::value_type, + typename Tensor::value_type, + typename Tensor::value_type>>; + + return gemm(MMA{}, D, A, B, C); +} + +// +// Thread-Local Register-Memory GEMMs +// + +// Dispatch [1]: (V) x (V) => (V) +template ::value && + ALayout::rank == 1 && is_rmem::value && + BLayout::rank == 1 && is_rmem::value && + CLayout::rank == 1 && is_rmem::value)> +CUTE_HOST_DEVICE +void +gemm(MMA_Atom const& mma, + Tensor & D, // (V) Logical data + Tensor const& A, // (V) Logical data + Tensor const& B, // (V) Logical data + Tensor const& C) // (V) Logical data +{ + // No static assertions on (V), MMA checks compatibility + mma.call(D, A, B, C); +} + +// Dispatch [2]: (M) x (N) => (M,N) +template ::value && + ALayout::rank == 1 && is_rmem::value && + BLayout::rank == 1 && is_rmem::value && + CLayout::rank == 2 && is_rmem::value)> +CUTE_HOST_DEVICE +void +gemm(MMA_Atom const& mma, + Tensor & D, // (M,N) Logical data + Tensor const& A, // (M) Logical data + Tensor const& B, // (N) Logical data + Tensor const& C) // (M,N) Logical data +{ + CUTE_STATIC_ASSERT_V(size<0>(A) == size<0>(C)); // AM == CM + CUTE_STATIC_ASSERT_V(size<0>(B) == size<1>(C)); // BN == CN + CUTE_STATIC_ASSERT_V(size<0>(C) == size<0>(D) && size<1>(C) == size<1>(D)); + + gemm(mma, + D, // (M,N) + make_tensor(A.data(), append<2>(A.layout())), // (M,1) + make_tensor(B.data(), append<2>(B.layout())), // (N,1) + C); // (M,N) +} + +// Dispatch [3]: (M,K) x (N,K) => (M,N) +template ::value && + ALayout::rank == 2 && is_rmem::value && + BLayout::rank == 2 && is_rmem::value && + CLayout::rank == 2 && is_rmem::value)> +CUTE_HOST_DEVICE +void +gemm(MMA_Atom const& mma, + Tensor & D, // (M,N) Logical data + Tensor const& A, // (M,K) Logical data + Tensor const& B, // (N,K) Logical data + Tensor const& C) // (M,N) Logical data +{ + CUTE_STATIC_ASSERT_V(size<0>(A) == size<0>(C)); // AM == CM + CUTE_STATIC_ASSERT_V(size<0>(B) == size<1>(C)); // BN == CN + CUTE_STATIC_ASSERT_V(size<1>(A) == size<1>(B)); // AK == BK + CUTE_STATIC_ASSERT_V(size<0>(C) == size<0>(D) && size<1>(C) == size<1>(D)); + + // Assert this is a 1-value MMA + CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom::LayoutC_TV{}) == Int<1>{}); + CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom::LayoutA_TV{}) == Int<1>{}); + CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom::LayoutB_TV{}) == Int<1>{}); + + gemm(mma, + 
make_tensor(D.data(), prepend<3>(D.layout())), // (1,M,N) + make_tensor(A.data(), prepend<3>(A.layout())), // (1,M,K) + make_tensor(B.data(), prepend<3>(B.layout())), // (1,N,K) + make_tensor(C.data(), prepend<3>(C.layout()))); // (1,M,N) +} + +// Dispatch [4]: (V,M) x (V,N) => (V,M,N) +template ::value && + ALayout::rank == 2 && is_rmem::value && + BLayout::rank == 2 && is_rmem::value && + CLayout::rank == 3 && is_rmem::value)> +CUTE_HOST_DEVICE +void +gemm(MMA_Atom const& mma, + Tensor & D, // (V,M,N) Logical data + Tensor const& A, // (V,M) Logical data + Tensor const& B, // (V,N) Logical data + Tensor const& C) // (V,M,N) Logical data +{ + CUTE_STATIC_ASSERT_V(size<1>(A) == size<1>(C)); // AM == CM + CUTE_STATIC_ASSERT_V(size<1>(B) == size<2>(C)); // BN == CN + CUTE_STATIC_ASSERT_V(size<0>(C) == size<0>(D) && size<1>(C) == size<1>(D) && size<2>(C) == size<2>(D)); + + // REGISTER .reuse OPTIMIZATIONS + + auto M = size<1>(A); + auto N = size<1>(B); + + // 64-bit traversal specialization -- serpentine path + if (size<0>(A) * sizeof(typename Tensor::value_type) == 8 && + size<0>(B) * sizeof(typename Tensor::value_type) == 8) + { +#if 1 // NOTE: Must depend on the C-matrix order... (which we can test) + // Row-major iteration + CUTE_UNROLL + for (int m = 0; m < M; ++m) { + CUTE_UNROLL + for (int n = 0; n < N; ++n) { + int ns = (m & 1) ? N-1-n : n; // Serpentine coordinate + gemm(mma, D(_,m,ns), A(_,m), B(_,ns), C(_,m,ns)); + } + } +#else + // Col-major iteration + CUTE_UNROLL + for (int n = 0; n < N; ++n) { + CUTE_UNROLL + for (int m = 0; m < M; ++m) { + int ms = (n & 1) ? M-1-m : m; // Serpentine coordinate + gemm(mma, D(_,ms,n), A(_,ms), B(_,n), C(_,ms,n)); + } + } +#endif + } else + + // 32-bit traversal specialization -- kinked serpentine path + if (size<0>(A) * sizeof(typename Tensor::value_type) == 4 && + size<0>(B) * sizeof(typename Tensor::value_type) == 4) + { +#if 1 // NOTE: Must depend on the C-matrix order... (which we can test) + // Row-major iteration + CUTE_UNROLL + for (int m = 0; m < M; m += 2) { + CUTE_UNROLL + for (int n = 0; n < N; ++n) { + int ns = (m & 2) ? N-1-n : n; + gemm(mma, D(_,m+0,ns), A(_,m+0), B(_,ns), C(_,m+0,ns)); + + if (m+1 < M) { + gemm(mma, D(_,m+1,ns), A(_,m+1), B(_,ns), C(_,m+1,ns)); + } + } + } +#else + // Col-major iteration + CUTE_UNROLL + for (int n = 0; n < N; n += 2) { + CUTE_UNROLL + for (int m = 0; m < M; ++m) { + // Kinked serpentine traversal for maximum register reuse + int ms = (n & 2) ? M-1-m : m; + gemm(mma, D(_,ms,n+0), A(_,ms), B(_,n+0), C(_,ms,n+0)); + + if (n+1 < N) { + gemm(mma, D(_,ms,n+1), A(_,ms), B(_,n+1), C(_,ms,n+1)); + } + } + } +#endif + } else { + // Fallback to serpentine loop + // Col-major iteration + CUTE_UNROLL + for (int n = 0; n < N; ++n) { + CUTE_UNROLL + for (int m = 0; m < M; ++m) { + int ms = (n & 1) ? 
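+          // Reversing the m-direction on odd n-columns keeps the A(_,ms)
+          // fragment touched at the end of one column live into the start
+          // of the next; e.g. for M=4 the visit order is 0,1,2,3 then 3,2,1,0.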
M-1-m : m; // Serpentine coordinate + gemm(mma, D(_,ms,n), A(_,ms), B(_,n), C(_,ms,n)); + } + } + } +} + +// Dispatch [5]: (V,M,K) x (V,N,K) => (V,M,N) +template ::value && + ALayout::rank == 3 && is_rmem::value && + BLayout::rank == 3 && is_rmem::value && + CLayout::rank == 3 && is_rmem::value)> +CUTE_HOST_DEVICE +void +gemm(MMA_Atom const& mma, + Tensor & D, // (V,M,N) Logical data + Tensor const& A, // (V,M,K) Logical data + Tensor const& B, // (V,N,K) Logical data + Tensor const& C) // (V,M,N) Logical data +{ + CUTE_STATIC_ASSERT_V(size<1>(A) == size<1>(C)); // AM == CM + CUTE_STATIC_ASSERT_V(size<1>(B) == size<2>(C)); // BN == CN + CUTE_STATIC_ASSERT_V(size<2>(A) == size<2>(B)); // AK == BK + CUTE_STATIC_ASSERT_V(size<0>(C) == size<0>(D) && size<1>(C) == size<1>(D) && size<2>(C) == size<2>(D)); + + auto K = size<2>(A); + + CUTE_UNROLL + for (int k = 0; k < K; ++k) { + gemm(mma, D, A(_,_,k), B(_,_,k), C); + } +} + +// +// Thread-Local Shared-Memory GEMMs +// + +// Dispatch [1]: (V) x (V) => (V) +// Dispatch [2]: (M) x (N) => (M,N) +// Dispatch [3]: (M,K) x (N,K) => (M,N) +// Dispatch [4]: (V,M) x (V,N) => (V,M,N) +// Dispatch [5]: (V,M,K) x (V,N,K) => (V,M,N) +// Dispatch [3]: (M,K) x (N,K) => (M,N) +template ::value && + ALayout::rank == 2 && is_smem::value && + BLayout::rank == 2 && is_smem::value && + CLayout::rank == 2 && is_rmem::value)> +CUTE_HOST_DEVICE +void +gemm(MMA_Atom const& mma, + Tensor & D, // (M,N) Logical data + Tensor const& A, // (M,K) Logical data + Tensor const& B, // (N,K) Logical data + Tensor const& C) // (M,N) Logical data +{ + CUTE_STATIC_ASSERT_V(size<0>(A) == size<0>(C)); // AM == CM + CUTE_STATIC_ASSERT_V(size<0>(B) == size<1>(C)); // BN == CN + CUTE_STATIC_ASSERT_V(size<1>(A) == size<1>(B)); // AK == BK + CUTE_STATIC_ASSERT_V(size<0>(C) == size<0>(D) && size<1>(C) == size<1>(D)); + + // Assert this is a 1-value MMA + CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom::LayoutC_TV{}) == Int<1>{}); + CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom::LayoutA_TV{}) == Int<1>{}); + CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom::LayoutB_TV{}) == Int<1>{}); + + gemm(mma, + make_tensor(D.data(), prepend<3>(D.layout())), // (1,M,N) + make_tensor(A.data(), prepend<3>(A.layout())), // (1,M,K) + make_tensor(B.data(), prepend<3>(B.layout())), // (1,N,K) + make_tensor(C.data(), prepend<3>(C.layout()))); // (1,M,N) +} + +// Dispatch [5]: (V,M,K) x (V,N,K) => (V,M,N) +template ::value && + ALayout::rank == 3 && is_smem::value && + BLayout::rank == 3 && is_smem::value && + CLayout::rank == 3 && is_rmem::value)> +CUTE_HOST_DEVICE +void +gemm(MMA_Atom const& mma, + Tensor & D, // (V,M,N) Logical data + Tensor const& A, // (V,M,K) Logical data + Tensor const& B, // (V,N,K) Logical data + Tensor const& C) // (V,M,N) Logical data +{ + CUTE_STATIC_ASSERT_V(size<1>(A) == size<1>(C)); // AM == CM + CUTE_STATIC_ASSERT_V(size<1>(B) == size<2>(C)); // BN == CN + CUTE_STATIC_ASSERT_V(size<2>(A) == size<2>(B)); // AK == BK + CUTE_STATIC_ASSERT_V(size<0>(C) == size<0>(D) && size<1>(C) == size<1>(D) && size<2>(C) == size<2>(D)); + + auto rA = MMA_Atom::make_fragment_A(A); + auto rB = MMA_Atom::make_fragment_B(B); + + auto K = size<2>(A); + + CUTE_UNROLL + for (int k = 0; k < K; ++k) + { + copy(A(_,_,k), rA(_,_,k)); + copy(B(_,_,k), rB(_,_,k)); + // Thread-level register gemm for k + gemm(mma, D, rA(_,_,k), rB(_,_,k), C); + } +} + +// +// Collective Shared-Memory GEMMs +// + +template ::value && + BLayout::rank == 2 && is_smem::value && + CLayout::rank == 2 && is_smem::value)> 
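+// A call sketch (thread-slice names assumed for illustration): with a
+// TiledMMA sliced to the calling thread,
+//
+//     auto thr_mma = tiled_mma.get_thread_slice(threadIdx.x);
+//     gemm(thr_mma, alpha, sA, sB, beta, sC, op_a, op_b);
+//
+// cooperatively computes sC = alpha * (op_a(sA) * op_b(sB)) + beta * sC on
+// shared-memory tiles, predicating away the (M,N,K) residues below.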
+CUTE_HOST_DEVICE +void +gemm(ThrMMA const& thr_mma, + Alpha const& alpha, + Tensor sA, + Tensor sB, + Beta const& beta, + Tensor sC, + ALoadTransformOp const& sA_load_op /* transforms A values before used in GEMM */, + BLoadTransformOp const& sB_load_op /* transforms B values before used in GEMM */) +{ + CUTE_STATIC_ASSERT_V(size<0>(sA) == size<0>(sC)); // AM == CM + CUTE_STATIC_ASSERT_V(size<0>(sB) == size<1>(sC)); // BN == CN + CUTE_STATIC_ASSERT_V(size<1>(sA) == size<1>(sB)); // AK == BK + + using TypeA = typename TA::value_type; + using TypeB = typename TB::value_type; + using TypeC = typename TC::value_type; + + static_assert(std::is_same_v>, TypeA>, + "ALoadTransformOp functor must accept and return value of type TA::value_type"); + static_assert(std::is_same_v>, TypeB>, + "BLoadTransformOp functor must accept and return value of type TB::value_type"); + + // Original, static size of the problem + auto M = size<0>(sC); + auto N = size<1>(sC); + auto K = size<1>(sA); + + // Block size of the compute tile + auto BLK_M = tile_size<0>(thr_mma); + auto BLK_N = tile_size<1>(thr_mma); + auto BLK_K = tile_size<2>(thr_mma); + + // Compute the "residues" + auto m_residue = M - BLK_M * (ceil_div(M, BLK_M) - Int<1>{}); // (0,BLK_M] + auto n_residue = N - BLK_N * (ceil_div(N, BLK_N) - Int<1>{}); // (0,BLK_N] + auto k_residue = K - BLK_K * (ceil_div(K, BLK_K) ); // (-BLK_K,0] + + // Shift the origin so k_residue is zeroth tile + sA.data() = &sA(0,k_residue); + sB.data() = &sB(0,k_residue); + +#if 0 + if (thread0()) { + printf("%d in BLK_M (%d)\n", int(m_residue), int(BLK_M)); + printf("%d in BLK_N (%d)\n", int(n_residue), int(BLK_N)); + printf("%d in BLK_K (%d)\n", int(k_residue), int(BLK_K)); + } +#endif + + // + // MMA Partitioning + // + + // Round the layout extents up to BLK_X + Tensor rounded_sA = sA.compose(make_shape(ceil_div(M, BLK_M) * BLK_M, ceil_div(K, BLK_K) * BLK_K)); + Tensor rounded_sB = sB.compose(make_shape(ceil_div(N, BLK_N) * BLK_N, ceil_div(K, BLK_K) * BLK_K)); + Tensor rounded_sC = sC.compose(make_shape(ceil_div(M, BLK_M) * BLK_M, ceil_div(N, BLK_N) * BLK_N)); + +#if 0 + if (thread0()) { + print(rounded_sA.layout()); print("\n"); + print(rounded_sB.layout()); print("\n"); + print(rounded_sC.layout()); print("\n"); + } +#endif + + // Partition the sA and sB tiles across the threads for the MMA + Tensor tCsA = thr_mma.partition_A(rounded_sA); // (MMA,MMA_M,MMA_K) + Tensor tCsB = thr_mma.partition_B(rounded_sB); // (MMA,MMA_N,MMA_K) + Tensor tCsC = thr_mma.partition_C(rounded_sC); // (MMA,MMA_M,MMA_N) + // Create register tensors for the MMA to operate on + Tensor tCrA = thr_mma.make_fragment_A(tCsA); // (MMA,MMA_M,MMA_K) + Tensor tCrB = thr_mma.make_fragment_B(tCsB); // (MMA,MMA_N,MMA_K) + Tensor tCrC = thr_mma.make_fragment_C(tCsC); // (MMA,MMA_M,MMA_N) + +#if 0 + if (thread0()) { + print(tCsA.layout()); print("\n"); + print(tCsB.layout()); print("\n"); + print(tCsC.layout()); print("\n"); + print(tCrA.layout()); print("\n"); + print(tCrB.layout()); print("\n"); + print(tCrC.layout()); print("\n"); + } +#endif + + // + // PREDICATION + // + + // Allocate the preds for only the MMA-mode of tCsA and tCsB + Tensor tCpA = make_tensor(size<0>(tCsA)); + Tensor tCpB = make_tensor(size<0>(tCsB)); + + // Create coordinate tensors on a single compute block for predication + Tensor cA = make_identity_tensor(make_shape(BLK_M, BLK_K)); // (BLK_M,BLK_K) -> (blk_m,blk_k) + Tensor cB = make_identity_tensor(make_shape(BLK_N, BLK_K)); // (BLK_M,BLK_K) -> (blk_n,blk_k) + + // Repeat 
partitioning with thr_mma + Tensor tCcA = thr_mma.partition_A(cA); // (MMA,1,1) -> (blk_m,blk_k) + Tensor tCcB = thr_mma.partition_B(cB); // (MMA,1,1) -> (blk_n,blk_k) + + // Populate the m and n predicates + CUTE_UNROLL + for (int i = 0; i < size(tCpA); ++i) { + tCpA(i) = elem_less(get<0>(tCcA(i)), m_residue); + } + CUTE_UNROLL + for (int i = 0; i < size(tCpB); ++i) { + tCpB(i) = elem_less(get<0>(tCcB(i)), n_residue); + } + +#if 0 + printf("Thr %d: A(%d,%d):%d B(%d,%d):%d\n", + threadIdx.x, + int(get<0>(tCcA(0))), int(get<1>(tCcA(0))), int(tCpA(0)), + int(get<0>(tCcB(0))), int(get<1>(tCcB(0))), int(tCpB(0))); +#endif + + // + // PREFETCH k_block = 0 (with k-predication) + // + + CUTE_UNROLL + for (int i = 0; i < size<0>(tCsA); ++i) { // Copy MMA_I + if (k_residue == 0 || get<1>(tCcA(i)) >= -k_residue) { // k_block = 0, predicated on k + CUTE_UNROLL + for (int m = 0; m < size<1>(tCsA); ++m) { // Copy MMA_M, predicated on m + tCrA(i,m,0) = (m_residue == BLK_M || m < size<1>(tCsA)-1 || tCpA(i)) ? sA_load_op(tCsA(i,m,0)) : TypeA{}; + } + } + } + + CUTE_UNROLL + for (int i = 0; i < size<0>(tCsB); ++i) { // Copy MMA_I + if (k_residue == 0 || get<1>(tCcB(i)) >= -k_residue) { // k_block = 0, predicated on k + CUTE_UNROLL + for (int n = 0; n < size<1>(tCsB); ++n) { // Copy MMA_N, predicated on n + tCrB(i,n,0) = (n_residue == BLK_N || n < size<1>(tCsB)-1 || tCpB(i)) ? sB_load_op(tCsB(i,n,0)) : TypeB{}; + } + } + } + // + // MAINLOOP + // + + // Clear accumulators + clear(tCrC); + + constexpr int K_BLOCK_MAX = size<2>(tCrA); + + CUTE_UNROLL + for (int k_block = 0; k_block < K_BLOCK_MAX; ++k_block) + { + // static-if load the next k_block. No k-predication required on these loads. + if (k_block < K_BLOCK_MAX-1) + { + // Load the next k_block + int k_next = k_block + 1; + + CUTE_UNROLL + for (int m = 0; m < size<1>(tCsA); ++m) { // Copy MMA_M + CUTE_UNROLL + for (int i = 0; i < size<0>(tCsA); ++i) { // Copy_if MMA_I predicated on m + tCrA(i,m,k_next) = (m_residue == BLK_M || m < size<1>(tCsA)-1 || tCpA(i)) ? sA_load_op(tCsA(i,m,k_next)) : TypeA{}; + } + } + + CUTE_UNROLL + for (int n = 0; n < size<1>(tCsB); ++n) { // Copy MMA_N + CUTE_UNROLL + for (int i = 0; i < size<0>(tCsB); ++i) { // Copy MMA_I predicated on n + tCrB(i,n,k_next) = (n_residue == BLK_N || n < size<1>(tCsB)-1 || tCpB(i)) ? sB_load_op(tCsB(i,n,k_next)) : TypeB{}; + } + } + } + + // GEMM on k_block in registers + gemm(thr_mma, tCrA(_,_,k_block), tCrB(_,_,k_block), tCrC); + } + + // + // Epilogue + // + + Tensor cC = make_identity_tensor(make_shape(BLK_M, BLK_N)); // (BLK_M,BLK_N) -> (blk_m,blk_n) + Tensor tCcC = thr_mma.partition_C(cC); // (MMA, 1, 1) -> (blk_m,blk_n) + + const bool isBetaZero = (beta == Beta{}); + + // Custom axpby_if for now + CUTE_UNROLL + for (int m = 0; m < size<1>(tCsC); ++m) + { + CUTE_UNROLL + for (int n = 0; n < size<2>(tCsC); ++n) + { + CUTE_UNROLL + for (int i = 0; i < size<0>(tCsC); ++i) + { + if ((m_residue == BLK_M || m < size<1>(tCrC)-1 || get<0>(tCcC(i)) < m_residue) && + (n_residue == BLK_N || n < size<2>(tCrC)-1 || get<1>(tCcC(i)) < n_residue)) + { + tCsC(i,m,n) = isBetaZero ? 
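+              // When beta == 0, the C-operand is never read, so
+              // uninitialized (or NaN) values in sC cannot pollute the
+              // result via beta * tCsC(i,m,n).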
                             alpha * tCrC(i,m,n) : alpha * tCrC(i,m,n) + beta * tCsC(i,m,n);
+        }
+      }
+    }
+  }
+}
+
+template ::value &&
+          BLayout::rank == 2 && is_smem::value &&
+          CLayout::rank == 2 && is_smem::value)>
+CUTE_HOST_DEVICE
+void
+gemm(ThrMMA const& thr_mma,
+     Alpha const& alpha,
+     Tensor sA,
+     Tensor sB,
+     Beta const& beta,
+     Tensor sC)
+{
+  gemm(thr_mma, alpha, sA, sB, beta, sC, identity() /* sA_load_op */, identity() /* sB_load_op */);
+}
+
+} // end namespace cute
diff --git a/include/cute/algorithm/prefer.hpp b/include/cute/algorithm/prefer.hpp
new file mode 100644
index 0000000000..700edff0ba
--- /dev/null
+++ b/include/cute/algorithm/prefer.hpp
@@ -0,0 +1,46 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+namespace cute
+{
+
+// Infinite types that inherit from each other
+template <int N>
+struct prefer : prefer<N-1> {};
+
+template <>
+struct prefer<0> {};
+
+// Can be used to preferentially overload implementations:
+// a higher N in prefer<N> has higher priority during overload resolution.
+
+} // end namespace cute
diff --git a/include/cute/algorithm/tensor_algorithms.hpp b/include/cute/algorithm/tensor_algorithms.hpp
new file mode 100644
index 0000000000..258ddec680
--- /dev/null
+++ b/include/cute/algorithm/tensor_algorithms.hpp
@@ -0,0 +1,102 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1.
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/** Common algorithms on (hierarchical) tensors */ + +#pragma once + +#include + +#include + +namespace cute +{ + +// +// for_each +// + +template +CUTE_HOST_DEVICE constexpr +void +for_each(Tensor const& tensor, UnaryOp&& op) +{ + CUTE_UNROLL + for (int i = 0; i < size(tensor); ++i) { + static_cast(op)(tensor(i)); + } +} + +template +CUTE_HOST_DEVICE constexpr +void +for_each(Tensor& tensor, UnaryOp&& op) +{ + CUTE_UNROLL + for (int i = 0; i < size(tensor); ++i) { + static_cast(op)(tensor(i)); + } +} + +// Accept mutable temporaries +template +CUTE_HOST_DEVICE constexpr +void +for_each(Tensor&& tensor, UnaryOp&& op) +{ + return for_each(tensor, static_cast(op)); +} + +// +// transform +// + +// Similar to std::transform but does not return number of elements affected +template +CUTE_HOST_DEVICE constexpr +void +transform(Tensor& tensor, UnaryOp&& op) +{ + CUTE_UNROLL + for (int i = 0; i < size(tensor); ++i) { + tensor(i) = static_cast(op)(tensor(i)); + } +} + +// Accept mutable temporaries +template +CUTE_HOST_DEVICE constexpr +void +transform(Tensor&& tensor, UnaryOp&& op) +{ + return transform(tensor, std::forward(op)); +} + +} // end namespace cute diff --git a/include/cute/algorithm/tuple_algorithms.hpp b/include/cute/algorithm/tuple_algorithms.hpp new file mode 100644 index 0000000000..35b19f9612 --- /dev/null +++ b/include/cute/algorithm/tuple_algorithms.hpp @@ -0,0 +1,846 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include +#include +#include +#include + +/** Common algorithms on (hierarchical) tuples */ +/** Style choice: + * Forward params [using static_cast(.)] for const/non-const/ref/non-ref args + * but don't bother forwarding functions as ref-qualified member fns are extremely rare + */ + +namespace cute +{ + +// +// Apply (Unpack) +// (t, f) => f(t_0,t_1,...,t_n) +// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +apply(T&& t, F&& f, seq) +{ + return f(get(static_cast(t))...); +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +apply(T&& t, F&& f) +{ + return detail::apply(static_cast(t), f, tuple_seq{}); +} + +// +// Transform Apply +// (t, f, g) => g(f(t_0),f(t_1),...) +// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +tapply(T&& t, F&& f, G&& g, seq) +{ + return g(f(get(static_cast(t)))...); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tapply(T0&& t0, T1&& t1, F&& f, G&& g, seq) +{ + return g(f(get(static_cast(t0)), + get(static_cast(t1)))...); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tapply(T0&& t0, T1&& t1, T2&& t2, F&& f, G&& g, seq) +{ + return g(f(get(static_cast(t0)), + get(static_cast(t1)), + get(static_cast(t2)))...); +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +transform_apply(T&& t, F&& f, G&& g) +{ + return detail::tapply(static_cast(t), f, g, tuple_seq{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +transform_apply(T0&& t0, T1&& t1, F&& f, G&& g) +{ + return detail::tapply(static_cast(t0), static_cast(t1), f, g, tuple_seq{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +transform_apply(T0&& t0, T1&& t1, T2&& t2, F&& f, G&& g) +{ + return detail::tapply(static_cast(t0), static_cast(t1), static_cast(t2), f, g, tuple_seq{}); +} + +// +// For Each +// (t, f) => f(t_0),f(t_1),...,f(t_n) +// + +template +CUTE_HOST_DEVICE constexpr +void +for_each(T&& t, F&& f) +{ + detail::apply(t, [&](auto&&... a) { (f(static_cast(a)), ...); }, tuple_seq{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +for_each_leaf(T&& t, F&& f) +{ + if constexpr (is_tuple>::value) { + return detail::apply(static_cast(t), [&](auto&&... 
a){ return (for_each_leaf(static_cast(a), f), ...); }, tuple_seq{}); + } else { + return f(static_cast(t)); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Transform +// (t, f) => (f(t_0),f(t_1),...,f(t_n)) +// + +template +CUTE_HOST_DEVICE constexpr +auto +transform(T const& t, F&& f) +{ + return detail::tapply(t, f, [](auto const&... a){ return cute::make_tuple(a...); }, tuple_seq{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +transform(T0 const& t0, T1 const& t1, F&& f) +{ + static_assert(tuple_size::value == tuple_size::value, "Mismatched tuple_size"); + return detail::tapply(t0, t1, f, [](auto const&... a){ return cute::make_tuple(a...); }, tuple_seq{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +transform(T0 const& t0, T1 const& t1, T2 const& t2, F&& f) +{ + static_assert(tuple_size::value == tuple_size::value, "Mismatched tuple_size"); + static_assert(tuple_size::value == tuple_size::value, "Mismatched tuple_size"); + return detail::tapply(t0, t1, t2, f, [](auto const&... a){ return cute::make_tuple(a...); }, tuple_seq{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +transform_leaf(T const& t, F&& f) +{ + if constexpr (is_tuple::value) { + return transform(t, [&](auto const& a) { return transform_leaf(a, f); }); + } else { + return f(t); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// find and find_if +// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +find_if(T const& t, F&& f, seq<>) +{ + return cute::integral_constant::value>{}; +} + +template +CUTE_HOST_DEVICE constexpr +auto +find_if(T const& t, F&& f, seq) +{ + if constexpr (decltype(f(get(t)))::value) { + return cute::integral_constant{}; + } else { + return find_if(t, f, seq{}); + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +find_if(T const& t, F&& f) +{ + if constexpr (is_tuple::value) { + return detail::find_if(t, f, tuple_seq{}); + } else { + return cute::integral_constant{}; + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +find(T const& t, X const& x) +{ + return find_if(t, [&](auto const& v) { return v == x; }); // This should always return a static true/false +} + +template +auto +none_of(T const& t, F&& f) +{ + return cute::integral_constant::value>{}; +} + +template +auto +all_of(T const& t, F&& f) +{ + auto not_f = [&](auto const& a) { return !f(a); }; + return cute::integral_constant::value>{}; +} + +template +auto +any_of(T const& t, F&& f) +{ + return cute::integral_constant{}; +} + +// +// Filter +// (t, f) => +// + +template +CUTE_HOST_DEVICE constexpr +auto +filter_tuple(T const& t, F&& f) +{ + return transform_apply(t, f, [](auto const&... a) { return cute::tuple_cat(a...); }); +} + +template +CUTE_HOST_DEVICE constexpr +auto +filter_tuple(T0 const& t0, T1 const& t1, F&& f) +{ + return transform_apply(t0, t1, f, [](auto const&... 
a) { return cute::tuple_cat(a...); }); +} + +// +// Fold (Reduce, Accumulate) +// (t, v, f) => f(...f(f(v,t_0),t_1),...,t_n) +// + +namespace detail { + +// This impl compiles much faster than cute::apply and variadic args +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +fold(T&& t, V&& v, F&& f, seq<>) +{ + return static_cast(v); +} + +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +fold(T&& t, V&& v, F&& f, seq) +{ + if constexpr (sizeof...(Is) == 0) { + return f(static_cast(v), get(static_cast(t))); + } else { + return fold(static_cast(t), + f(static_cast(v), get(static_cast(t))), + f, + seq{}); + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +fold(T&& t, V&& v, F&& f) +{ + if constexpr (is_tuple>::value) { + return detail::fold(static_cast(t), + static_cast(v), + f, + tuple_seq{}); + } else { + return f(static_cast(v), static_cast(t)); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +fold_first(T&& t, F&& f) +{ + if constexpr (is_tuple>::value) { + return detail::fold(static_cast(t), + get<0>(static_cast(t)), + f, + make_range<1,std::tuple_size>::value>{}); + } else { + return static_cast(t); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// front, back, take, unwrap +// + +// Get the first non-tuple element in a hierarchical tuple +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +front(T&& t) +{ + if constexpr (is_tuple>::value) { + return front(get<0>(static_cast(t))); + } else { + return static_cast(t); + } + + CUTE_GCC_UNREACHABLE; +} + +// Get the last non-tuple element in a hierarchical tuple +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +back(T&& t) +{ + if constexpr (is_tuple>::value) { + constexpr int N = tuple_size>::value; + return back(get(static_cast(t))); + } else { + return static_cast(t); + } + + CUTE_GCC_UNREACHABLE; +} + +// Takes the elements in the range [B,E) +template +CUTE_HOST_DEVICE constexpr +auto +take(T const& t) +{ + return detail::apply(t, [](auto const&... a) { return cute::make_tuple(a...); }, make_range{}); +} + +// Unwrap rank-1 tuples until we're left with a rank>1 tuple or a non-tuple +template +CUTE_HOST_DEVICE constexpr +auto +unwrap(T const& t) +{ + if constexpr (is_tuple::value) { + if constexpr (tuple_size::value == 1) { + return unwrap(get<0>(t)); + } else { + return t; + } + } else { + return t; + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Flatten a hierarchical tuple to a tuple of depth one. 
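+// e.g. flatten(make_tuple(_1{}, make_tuple(_2{}, make_tuple(_3{})), _4{}))
+//      returns (_1,_2,_3,_4)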
+// + +template +CUTE_HOST_DEVICE constexpr +auto +flatten_to_tuple(T const& t) +{ + if constexpr (is_tuple::value) { + return filter_tuple(t, [](auto const& a) { return flatten_to_tuple(a); }); + } else { + return cute::make_tuple(t); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +flatten(T const& t) +{ + if constexpr (is_tuple::value) { + return filter_tuple(t, [](auto const& a) { return flatten_to_tuple(a); }); + } else { + return t; + } + + CUTE_GCC_UNREACHABLE; +} + +// +// insert and remove and replace +// + +namespace detail { + +// Shortcut around tuple_cat for common insert/remove/repeat cases +template +CUTE_HOST_DEVICE constexpr +auto +construct(T const& t, X const& x, seq, seq, seq) +{ + return cute::make_tuple(get(t)..., (void(J),x)..., get(t)...); +} + +} // end namespace detail + +// Insert x into the Nth position of the tuple +template +CUTE_HOST_DEVICE constexpr +auto +insert(T const& t, X const& x) +{ + return detail::construct(t, x, make_seq{}, seq<0>{}, make_range::value>{}); +} + +// Remove the Nth element of the tuple +template +CUTE_HOST_DEVICE constexpr +auto +remove(T const& t) +{ + return detail::construct(t, 0, make_seq{}, seq<>{}, make_range::value>{}); +} + +// Replace the Nth element of the tuple with x +template +CUTE_HOST_DEVICE constexpr +auto +replace(T const& t, X const& x) +{ + return detail::construct(t, x, make_seq{}, seq<0>{}, make_range::value>{}); +} + +// Replace the first element of the tuple with x +template +CUTE_HOST_DEVICE constexpr +auto +replace_front(T const& t, X const& x) +{ + if constexpr (is_tuple::value) { + return detail::construct(t, x, seq<>{}, seq<0>{}, make_range<1,tuple_size::value>{}); + } else { + return x; + } + + CUTE_GCC_UNREACHABLE; +} + +// Replace the last element of the tuple with x +template +CUTE_HOST_DEVICE constexpr +auto +replace_back(T const& t, X const& x) +{ + if constexpr (is_tuple::value) { + return detail::construct(t, x, make_seq::value-1>{}, seq<0>{}, seq<>{}); + } else { + return x; + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Make a tuple of Xs of tuple_size N +// + +template +CUTE_HOST_DEVICE constexpr +auto +repeat(X const& x) +{ + return detail::construct(0, x, seq<>{}, make_seq{}, seq<>{}); +} + +// +// Make a tuple of Xs the same profile as tuple +// + +template +CUTE_HOST_DEVICE constexpr +auto +repeat_like(T const& t, X const& x) +{ + if constexpr (is_tuple::value) { + return transform(t, [&](auto const& a) { return repeat_like(a,x); }); + } else { + return x; + } + + CUTE_GCC_UNREACHABLE; +} + +// Group the elements [B,E) of a T into a single element +// e.g. 
group<2,4>(T<_1,_2,_3,_4,_5,_6>{}) +// => T<_1,_2,T<_3,_4>,_5,_6>{} +template +CUTE_HOST_DEVICE constexpr +auto +group(T const& t) +{ + return detail::construct(t, take(t), make_seq{}, seq<0>{}, make_range::value>{}); +} + +// +// Extend a T to rank N by appending/prepending an element +// + +template +CUTE_HOST_DEVICE constexpr +auto +append(T const& a, X const& x) +{ + if constexpr (is_tuple::value) { + if constexpr (N == tuple_size::value) { + return a; + } else { + static_assert(N > tuple_size::value); + return detail::construct(a, x, make_seq::value>{}, make_seq::value>{}, seq<>{}); + } + } else { + if constexpr (N == 1) { + return a; + } else { + return detail::construct(cute::make_tuple(a), x, seq<0>{}, make_seq{}, seq<>{}); + } + } + + CUTE_GCC_UNREACHABLE; +} +template +CUTE_HOST_DEVICE constexpr +auto +append(T const& a, X const& x) +{ + if constexpr (is_tuple::value) { + return detail::construct(a, x, make_seq::value>{}, seq<0>{}, seq<>{}); + } else { + return cute::make_tuple(a, x); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +prepend(T const& a, X const& x) +{ + if constexpr (is_tuple::value) { + if constexpr (N == tuple_size::value) { + return a; + } else { + static_assert(N > tuple_size::value); + return detail::construct(a, x, seq<>{}, make_seq::value>{}, make_seq::value>{}); + } + } else { + if constexpr (N == 1) { + return a; + } else { + static_assert(N > 1); + return detail::construct(cute::make_tuple(a), x, seq<>{}, make_seq{}, seq<0>{}); + } + } + + CUTE_GCC_UNREACHABLE; +} +template +CUTE_HOST_DEVICE constexpr +auto +prepend(T const& a, X const& x) +{ + if constexpr (is_tuple::value) { + return detail::construct(a, x, seq<>{}, seq<0>{}, make_seq::value>{}); + } else { + return cute::make_tuple(x, a); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Inclusive scan (prefix sum) +// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +iscan(T const& t, V const& v, F&& f, seq) +{ + // Apply the function to v and the element at I + auto v_next = f(v, get(t)); + // Replace I with v_next + auto t_next = replace(t, v_next); + +#if 0 + std::cout << "ISCAN i" << I << std::endl; + std::cout << " t " << t << std::endl; + std::cout << " i " << v << std::endl; + std::cout << " f(i,t) " << v_next << std::endl; + std::cout << " t_n " << t_next << std::endl; +#endif + + if constexpr (sizeof...(Is) == 0) { + return t_next; + } else { + return iscan(t_next, v_next, f, seq{}); + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +iscan(T const& t, V const& v, F&& f) +{ + return detail::iscan(t, v, f, tuple_seq{}); +} + +// +// Exclusive scan (prefix sum) +// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +escan(T const& t, V const& v, F&& f, seq) +{ + if constexpr (sizeof...(Is) == 0) { + // Replace I with v + return replace(t, v); + } else { + // Apply the function to v and the element at I + auto v_next = f(v, get(t)); + // Replace I with v + auto t_next = replace(t, v); + +#if 0 + std::cout << "ESCAN i" << I << std::endl; + std::cout << " t " << t << std::endl; + std::cout << " i " << v << std::endl; + std::cout << " f(i,t) " << v_next << std::endl; + std::cout << " t_n " << t_next << std::endl; +#endif + + // Recurse + return escan(t_next, v_next, f, seq{}); + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +escan(T const& t, V const& v, F&& f) +{ + return detail::escan(t, v, f, tuple_seq{}); +} + +// +// 
Zip (Transpose) +// + +// Take ((a,b,c,...),(x,y,z,...),...) rank-R0 x rank-R1 input +// to produce ((a,x,...),(b,y,...),(c,z,...),...) rank-R1 x rank-R0 output + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +zip_(T const& t, seq) +{ + return cute::make_tuple(get(get(t))...); +} + +template +CUTE_HOST_DEVICE constexpr +auto +zip(T const& t, seq, seq) +{ + static_assert(conjunction>::value == tuple_size>::value>...>::value, "Mismatched Ranks"); + return cute::make_tuple(detail::zip_(t, seq{})...); +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +zip(T const& t) +{ + if constexpr (is_tuple::value) { + if constexpr (is_tuple>::value) { + return detail::zip(t, tuple_seq{}, tuple_seq>{}); + } else { + return cute::make_tuple(t); + } + } else { + return t; + } + + CUTE_GCC_UNREACHABLE; +} + +// Convenient to pass them in separately +template +CUTE_HOST_DEVICE constexpr +auto +zip(T0 const& t0, T1 const& t1, Ts const&... ts) +{ + return zip(cute::make_tuple(t0, t1, ts...)); +} + +// +// zip2_by -- A guided zip for rank-2 tuples +// Take a tuple like ((A,a),((B,b),(C,c)),d) +// and produce a tuple ((A,(B,C)),(a,(b,c),d)) +// where the rank-2 modes are selected by the terminals of the guide (X,(X,X)) +// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +zip2_by(T const& t, TG const& guide, seq, seq) +{ + // zip2_by produces the modes like ((A,a),(B,b),...) + auto split = cute::make_tuple(zip2_by(get(t), get(guide))...); + + // Rearrange and append missing modes from t to make ((A,B,...),(a,b,...,x,y)) + return cute::make_tuple(cute::make_tuple(get(split)...), + cute::make_tuple(get(split)..., get(t)...)); +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +zip2_by(T const& t, TG const& guide) +{ + if constexpr (is_tuple::value) { + constexpr int TR = tuple_size::value; + constexpr int GR = tuple_size::value; + static_assert(TR >= GR, "Mismatched ranks"); + return detail::zip2_by(t, guide, + make_range< 0, GR>{}, + make_range{}); + } else { + static_assert(tuple_size::value == 2, "Mismatched ranks"); + return t; + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace cute diff --git a/include/cute/arch/cluster_sm90.hpp b/include/cute/arch/cluster_sm90.hpp new file mode 100644 index 0000000000..6fd9edd382 --- /dev/null +++ b/include/cute/arch/cluster_sm90.hpp @@ -0,0 +1,190 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +// Config +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && \ + ((__CUDACC_VER_MAJOR__ >= 12) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8)))) +# define CUTE_ARCH_CLUSTER_SM90_ENABLED +#endif + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDACC_VER_MAJOR__ >= 12)) +# define CUTE_ARCH_ELECT_ONE_SM90_ENABLED +#endif + +namespace cute { + +CUTE_DEVICE void cluster_arrive_relaxed() +{ +#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED) + asm volatile("barrier.cluster.arrive.relaxed.aligned;\n" : : ); +#else + asm volatile ("brkpt;\n" ::); +#endif +} + +CUTE_DEVICE void cluster_arrive() +{ +#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED) + asm volatile("barrier.cluster.arrive.aligned;\n" : : ); +#else + asm volatile ("brkpt;\n" ::); +#endif +} + +CUTE_DEVICE void cluster_wait() +{ +#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED) + asm volatile("barrier.cluster.wait.aligned;\n" : : ); +#else + asm volatile ("brkpt;\n" ::); +#endif +} + +CUTE_DEVICE void cluster_sync() +{ +#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED) + cluster_arrive(); + cluster_wait(); +#else + asm volatile ("brkpt;\n" ::); +#endif +} + +// Returns the dim3 grid size in terms of number of clusters. +CUTE_DEVICE dim3 cluster_grid_dims() +{ +#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED) + uint32_t x, y, z; + asm volatile("mov.u32 %0, %nclusterid.x;\n" : "=r"(x) : ); + asm volatile("mov.u32 %0, %nclusterid.y;\n" : "=r"(y) : ); + asm volatile("mov.u32 %0, %nclusterid.z;\n" : "=r"(z) : ); + return {x, y, z}; +#else + return gridDim; +#endif +} + +// Returns the dim3 cluster rank in the grid. +CUTE_DEVICE dim3 cluster_id_in_grid() +{ +#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED) + uint32_t x, y, z; + asm volatile("mov.u32 %0, %clusterid.x;\n" : "=r"(x) : ); + asm volatile("mov.u32 %0, %clusterid.y;\n" : "=r"(y) : ); + asm volatile("mov.u32 %0, %clusterid.z;\n" : "=r"(z) : ); + return {x, y, z}; +#else + return blockIdx; +#endif +} + +// Returns the relative dim3 block rank local to the cluster. +CUTE_DEVICE dim3 block_id_in_cluster() +{ +#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED) + uint32_t x, y, z; + asm volatile("mov.u32 %0, %cluster_ctaid.x;\n" : "=r"(x) : ); + asm volatile("mov.u32 %0, %cluster_ctaid.y;\n" : "=r"(y) : ); + asm volatile("mov.u32 %0, %cluster_ctaid.z;\n" : "=r"(z) : ); + return {x, y, z}; +#else + return {0,0,0}; +#endif +} + +// Returns the dim3 cluster shape. 
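+// A device-side usage sketch (assumes the kernel was launched with a
+// nontrivial cluster, e.g. via cudaLaunchKernelEx):
+//
+//     dim3     cs   = cluster_shape();          // e.g. {2,1,1}
+//     uint32_t rank = block_rank_in_cluster();  // 0 <= rank < cs.x*cs.y*cs.z
+//     cluster_sync();                           // arrive + wait across the cluster
+//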
+CUTE_DEVICE dim3 cluster_shape() +{ +#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED) + uint32_t x, y, z; + asm volatile("mov.u32 %0, %cluster_nctaid.x;\n" : "=r"(x) : ); + asm volatile("mov.u32 %0, %cluster_nctaid.y;\n" : "=r"(y) : ); + asm volatile("mov.u32 %0, %cluster_nctaid.z;\n" : "=r"(z) : ); + return {x, y, z}; +#else + return {1,1,1}; +#endif +} + +// Get 1D ctaid in a cluster. +CUTLASS_DEVICE uint32_t block_rank_in_cluster() +{ +#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED) + uint32_t rank; + asm volatile("mov.u32 %0, %cluster_ctarank;\n" : "=r"(rank) :); + return rank; +#else + return 0; +#endif +} + +// Set the destination block-ID in cluster for a given SMEM Address +CUTLASS_DEVICE uint32_t set_block_rank(uint32_t smemAddr, uint32_t rank) +{ +#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED) + uint32_t result; + asm volatile("mapa.shared::cluster.u32 %0, %1, %2;\n" + : "=r"(result) + : "r"(smemAddr), "r"(rank)); + return result; +#else + return smemAddr; +#endif +} + +// Elect one thread in the warp. The elected thread gets its predicate set to true, all others obtain false. +CUTE_HOST_DEVICE uint32_t elect_one_sync() +{ +#if defined(CUTE_ARCH_ELECT_ONE_SM90_ENABLED) + uint32_t pred = 0; + uint32_t laneid = 0; + asm volatile( + "{\n" + ".reg .b32 %rx;\n" + ".reg .pred %px;\n" + " elect.sync %rx|%px, %2;\n" + "@%px mov.s32 %1, 1;\n" + " mov.s32 %0, %rx;\n" + "}\n" + : "+r"(laneid), "+r"(pred) + : "r"(0xFFFFFFFF)); + return pred; +#elif defined(__CUDA_ARCH__) + return (threadIdx.x % 32) == 0; +#else + return true; +#endif +} + +} // end namespace cute diff --git a/include/cute/arch/copy.hpp b/include/cute/arch/copy.hpp new file mode 100644 index 0000000000..aa7bb333ed --- /dev/null +++ b/include/cute/arch/copy.hpp @@ -0,0 +1,71 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include + +namespace cute +{ + +// +// Direct Copy for any type +// + +template +struct UniversalCopy +{ + using SRegisters = S[1]; + using DRegisters = D[1]; + + CUTE_HOST_DEVICE static constexpr void + copy(S const& src, + D & dst) + { + dst = src; + } +}; + +// +// Placeholder for the copy algorithm's default, auto-vectorizing behavior +// + +struct DefaultCopy +{ + using SRegisters = uint128_t[1]; + using DRegisters = uint128_t[1]; +}; + +using AutoVectorizingCopy = DefaultCopy; + +} // end namespace cute diff --git a/include/cute/arch/copy_sm75.hpp b/include/cute/arch/copy_sm75.hpp new file mode 100644 index 0000000000..fda6340d35 --- /dev/null +++ b/include/cute/arch/copy_sm75.hpp @@ -0,0 +1,215 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include + +#include + +// Config +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)) +# define CUTE_ARCH_LDSM_SM75_ENABLED +#endif + +namespace cute +{ + +struct SM75_U32x1_LDSM_N +{ + using SRegisters = uint128_t[1]; + using DRegisters = uint32_t[1]; + + CUTE_HOST_DEVICE static void + copy(uint128_t const& smem_src, + uint32_t& dst) + { +#if defined(CUTE_ARCH_LDSM_SM75_ENABLED) + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_src); + asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];\n" + : "=r"(dst) + : "r"(smem_int_ptr)); +#else + CUTE_RUNTIME_ASSERT("Trying to use ldmatrix without CUTE_ARCH_LDSM_SM75_ENABLED."); +#endif + } +}; + +struct SM75_U32x2_LDSM_N +{ + using SRegisters = uint128_t[1]; + using DRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + copy(uint128_t const& smem_src, + uint32_t& dst0, uint32_t& dst1) + { +#if defined(CUTE_ARCH_LDSM_SM75_ENABLED) + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_src); + asm volatile ("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];\n" + : "=r"(dst0), "=r"(dst1) + : "r"(smem_int_ptr)); +#else + CUTE_RUNTIME_ASSERT("Trying to use ldmatrix without CUTE_ARCH_LDSM_SM75_ENABLED."); +#endif + } +}; + +struct SM75_U32x4_LDSM_N +{ + using SRegisters = uint128_t[1]; + using DRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + copy(uint128_t const& smem_src, + uint32_t& dst0, uint32_t& dst1, uint32_t& dst2, uint32_t& dst3) + { +#if defined(CUTE_ARCH_LDSM_SM75_ENABLED) + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_src); + asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];\n" + : "=r"(dst0), "=r"(dst1), "=r"(dst2), "=r"(dst3) + : "r"(smem_int_ptr)); +#else + CUTE_RUNTIME_ASSERT("Trying to use ldmatrix without CUTE_ARCH_LDSM_SM75_ENABLED."); +#endif + } +}; + +struct SM75_U16x2_LDSM_T +{ + using SRegisters = uint128_t[1]; + using DRegisters = uint32_t[1]; + + CUTE_HOST_DEVICE static void + copy(uint128_t const& smem_src, + uint32_t& dst) + { +#if defined(CUTE_ARCH_LDSM_SM75_ENABLED) + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_src); + asm volatile ("ldmatrix.sync.aligned.x1.trans.m8n8.shared.b16 {%0}, [%1];\n" + : "=r"(dst) + : "r"(smem_int_ptr)); +#else + CUTE_RUNTIME_ASSERT("Trying to use ldmatrix without CUTE_ARCH_LDSM_SM75_ENABLED."); +#endif + } +}; + +struct SM75_U16x4_LDSM_T +{ + using SRegisters = uint128_t[1]; + using DRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + copy(uint128_t const& smem_src, + uint32_t& dst0, uint32_t& dst1) + { +#if defined(CUTE_ARCH_LDSM_SM75_ENABLED) + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_src); + asm volatile ("ldmatrix.sync.aligned.x2.trans.m8n8.shared.b16 {%0, %1}, [%2];\n" + : "=r"(dst0), "=r"(dst1) + : "r"(smem_int_ptr)); +#else + CUTE_RUNTIME_ASSERT("Trying to use ldmatrix without CUTE_ARCH_LDSM_SM75_ENABLED."); +#endif + } +}; + +struct SM75_U16x8_LDSM_T +{ + using SRegisters = uint128_t[1]; + using DRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + copy(uint128_t const& smem_src, + uint32_t& dst0, uint32_t& dst1, uint32_t& dst2, uint32_t& dst3) + { +#if defined(CUTE_ARCH_LDSM_SM75_ENABLED) + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_src); + asm volatile ("ldmatrix.sync.aligned.x4.trans.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];\n" + : "=r"(dst0), "=r"(dst1), "=r"(dst2), "=r"(dst3) + : "r"(smem_int_ptr)); +#else + CUTE_RUNTIME_ASSERT("Trying to 
use ldmatrix without CUTE_ARCH_LDSM_SM75_ENABLED."); +#endif + } +}; + +// +// Legacy LDSM interfaces that aren't very useful +// + +template +CUTE_HOST_DEVICE +void +copy_ldsm(uint128_t const* const smem_ptr, + T* rmem_ptr) +{ + uint32_t* reg_ptr = reinterpret_cast(rmem_ptr); + + // if constexpr + if (sizeof(T) == 4) { + SM75_U32x1_LDSM_N::copy(smem_ptr[0], reg_ptr[0]); + } + else if (sizeof(T) == 8) { + SM75_U32x2_LDSM_N::copy(smem_ptr[0], reg_ptr[0], reg_ptr[1]); + } + else if (sizeof(T) == 16) { + SM75_U32x4_LDSM_N::copy(smem_ptr[0], reg_ptr[0], reg_ptr[1], reg_ptr[2], reg_ptr[3]); + } + else { + static_assert(sizeof(T) == 4 || sizeof(T) == 8 || sizeof(T) == 16, "sizeof(T) is not supported"); + } +} + +template +CUTE_HOST_DEVICE +void +copy_ldsm_trans(uint128_t const* const smem_ptr, + T* rmem_ptr) +{ + uint32_t* reg_ptr = reinterpret_cast(rmem_ptr); + + // if constexpr + if (sizeof(T) == 4) { + SM75_U16x2_LDSM_T::copy(smem_ptr[0], reg_ptr[0]); + } + else if (sizeof(T) == 8) { + SM75_U16x4_LDSM_T::copy(smem_ptr[0], reg_ptr[0], reg_ptr[1]); + } + else if (sizeof(T) == 16) { + SM75_U16x8_LDSM_T::copy(smem_ptr[0], reg_ptr[0], reg_ptr[1], reg_ptr[2], reg_ptr[3]); + } + else { + static_assert(sizeof(T) == 4 || sizeof(T) == 8 || sizeof(T) == 16, "sizeof(T) is not supported"); + } +} + +} // end namespace cute diff --git a/include/cute/arch/copy_sm80.hpp b/include/cute/arch/copy_sm80.hpp new file mode 100644 index 0000000000..c6c44121bd --- /dev/null +++ b/include/cute/arch/copy_sm80.hpp @@ -0,0 +1,138 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include + +#include + +// Config +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) +# define CUTE_ARCH_CP_ASYNC_SM80_ENABLED +#endif + +namespace cute +{ + +/// Copy via cp.async with caching at all levels +template +struct SM80_CP_ASYNC_CACHEALWAYS +{ + using SRegisters = TS[1]; + using DRegisters = TD[1]; + + static_assert(sizeof(TS) == sizeof(TD), "cp.async requires sizeof(src_value_type) == sizeof(dst_value_type)"); + static_assert(sizeof(TS) == 4 || sizeof(TS) == 8 || sizeof(TS) == 16, "cp.async sizeof(TS) is not supported"); + + CUTE_HOST_DEVICE static void + copy(TS const& gmem_src, + TD & smem_dst) + { +#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED) + TS const* gmem_ptr = &gmem_src; + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst); + asm volatile("cp.async.ca.shared.global [%0], [%1], %2;\n" + :: "r"(smem_int_ptr), + "l"(gmem_ptr), + "n"(sizeof(TS))); +#else + CUTE_RUNTIME_ASSERT("Support for cp.async instructions has not been enabled"); +#endif + } +}; + +/// Copy via cp.async with caching at global level +template +struct SM80_CP_ASYNC_CACHEGLOBAL +{ + using SRegisters = TS[1]; + using DRegisters = TD[1]; + + static_assert(sizeof(TS) == sizeof(TD), "cp.async requires sizeof(src_value_type) == sizeof(dst_value_type)"); + static_assert(sizeof(TS) == 4 || sizeof(TS) == 8 || sizeof(TS) == 16, "cp.async sizeof(TS) is not supported"); + + CUTE_HOST_DEVICE static void + copy(TS const& gmem_src, + TD & smem_dst) + { +#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED) + TS const* gmem_ptr = &gmem_src; + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst); + asm volatile("cp.async.cg.shared.global [%0], [%1], %2;\n" + :: "r"(smem_int_ptr), + "l"(gmem_ptr), + "n"(sizeof(TS))); +#else + CUTE_RUNTIME_ASSERT("Support for cp.async instructions has not been enabled"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Establishes an ordering w.r.t previously issued cp.async instructions. Does not block. +CUTE_HOST_DEVICE +void +cp_async_fence() +{ +#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED) + asm volatile("cp.async.commit_group;\n" ::); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Blocks until all but N previous cp.async.commit_group operations have committed. +template +CUTE_HOST_DEVICE +void +cp_async_wait() +{ +#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED) + if constexpr (N == 0) { + asm volatile("cp.async.wait_all;\n" ::); + } else { + asm volatile("cp.async.wait_group %0;\n" :: "n"(N)); + } +#endif +} + +template +CUTE_HOST_DEVICE +void +cp_async_wait(Int) +{ + return cp_async_wait(); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // end namespace cute diff --git a/include/cute/arch/copy_sm90.hpp b/include/cute/arch/copy_sm90.hpp new file mode 100644 index 0000000000..6ac96438c1 --- /dev/null +++ b/include/cute/arch/copy_sm90.hpp @@ -0,0 +1,225 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
diff --git a/include/cute/arch/copy_sm90.hpp b/include/cute/arch/copy_sm90.hpp
new file mode 100644
index 0000000000..6ac96438c1
--- /dev/null
+++ b/include/cute/arch/copy_sm90.hpp
@@ -0,0 +1,225 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ * (BSD-3-Clause text identical to the header reproduced in copy_sm80.hpp above.)
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/arch/copy.hpp>
+
+// Config
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDACC_VER_MAJOR__ >= 12))
+#  define CUTE_ARCH_STSM_SM90_ENABLED
+#  define CUTE_ARCH_TMA_SM90_ENABLED
+#endif
+
+namespace cute
+{
+
+struct SM90_U32x1_STSM_N
+{
+  using SRegisters = uint32_t[1];
+  using DRegisters = uint128_t[1];
+
+  CUTE_HOST_DEVICE static void
+  copy(uint32_t const& src,
+       uint128_t& smem_dst)
+  {
+#if defined(CUTE_ARCH_STSM_SM90_ENABLED)
+    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
+    asm volatile ("stmatrix.sync.aligned.x1.m8n8.shared.b16 [%0], {%1};\n"
+                  :: "r"(smem_int_ptr),
+                     "r"(src));
+#else
+    CUTE_RUNTIME_ASSERT("Trying to use stmatrix without CUTE_ARCH_STSM_SM90_ENABLED.");
+#endif
+  }
+};
+
+struct SM90_U32x2_STSM_N
+{
+  using SRegisters = uint32_t[2];
+  using DRegisters = uint128_t[1];
+
+  CUTE_HOST_DEVICE static void
+  copy(uint32_t const& src0, uint32_t const& src1,
+       uint128_t& smem_dst)
+  {
+#if defined(CUTE_ARCH_STSM_SM90_ENABLED)
+    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
+    asm volatile ("stmatrix.sync.aligned.x2.m8n8.shared.b16 [%0], {%1, %2};\n"
+                  :: "r"(smem_int_ptr),
+                     "r"(src0), "r"(src1));
+#else
+    CUTE_RUNTIME_ASSERT("Trying to use stmatrix without CUTE_ARCH_STSM_SM90_ENABLED.");
+#endif
+  }
+};
+
+struct SM90_U32x4_STSM_N
+{
+  using SRegisters = uint32_t[4];
+  using DRegisters = uint128_t[1];
+
+  CUTE_HOST_DEVICE static void
+  copy(uint32_t const& src0, uint32_t const& src1, uint32_t const& src2, uint32_t const& src3,
+       uint128_t& smem_dst)
+  {
+#if defined(CUTE_ARCH_STSM_SM90_ENABLED)
+    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
+    asm volatile ("stmatrix.sync.aligned.x4.m8n8.shared.b16 [%0], {%1, %2, %3, %4};\n"
+                  :: "r"(smem_int_ptr),
+                     "r"(src0), "r"(src1), "r"(src2), "r"(src3));
+#else
+    CUTE_RUNTIME_ASSERT("Trying to use stmatrix without CUTE_ARCH_STSM_SM90_ENABLED.");
+#endif
+  }
+};
+
+struct SM90_U16x2_STSM_T
+{
+  using SRegisters = uint32_t[1];
+  using DRegisters = uint128_t[1];
+
+  CUTE_HOST_DEVICE static void
+  copy(uint32_t const& src,
+       uint128_t& smem_dst)
+  {
+#if defined(CUTE_ARCH_STSM_SM90_ENABLED)
+    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
+    asm volatile ("stmatrix.sync.aligned.x1.trans.m8n8.shared.b16 [%0], {%1};\n"
+                  :: "r"(smem_int_ptr),
+                     "r"(src));
+#else
+    CUTE_RUNTIME_ASSERT("Trying to use stmatrix without CUTE_ARCH_STSM_SM90_ENABLED.");
+#endif
+  }
+};
+
+struct SM90_U16x4_STSM_T
+{
+  using SRegisters = uint32_t[2];
+  using DRegisters = uint128_t[1];
+
+  CUTE_HOST_DEVICE static void
+  copy(uint32_t const& src0, uint32_t const& src1,
+       uint128_t& smem_dst)
+  {
+#if defined(CUTE_ARCH_STSM_SM90_ENABLED)
+    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
+    asm volatile ("stmatrix.sync.aligned.x2.trans.m8n8.shared.b16 [%0], {%1, %2};\n"
+                  :: "r"(smem_int_ptr),
+                     "r"(src0), "r"(src1));
+#else
+    CUTE_RUNTIME_ASSERT("Trying to use stmatrix without CUTE_ARCH_STSM_SM90_ENABLED.");
+#endif
+  }
+};
+
+struct SM90_U16x8_STSM_T
+{
+  using SRegisters = uint32_t[4];
+  using DRegisters = uint128_t[1];
+
+  CUTE_HOST_DEVICE static void
+  copy(uint32_t const& src0, uint32_t const& src1, uint32_t const& src2, uint32_t const& src3,
+       uint128_t& smem_dst)
+  {
+#if defined(CUTE_ARCH_STSM_SM90_ENABLED)
+    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
+    asm volatile ("stmatrix.sync.aligned.x4.trans.m8n8.shared.b16 [%0], {%1, %2, %3, %4};\n"
+                  :: "r"(smem_int_ptr),
+                     "r"(src0), "r"(src1), "r"(src2), "r"(src3));
+#else
+    CUTE_RUNTIME_ASSERT("Trying to use stmatrix without CUTE_ARCH_STSM_SM90_ENABLED.");
+#endif
+  }
+};
+
+//
+// Legacy STSM interfaces that aren't very useful
+//
+
+template <class T>
+CUTE_HOST_DEVICE
+void
+copy_stsm(T const* const rmem_ptr,
+          uint128_t* const smem_ptr)
+{
+  uint32_t const* reg_ptr = reinterpret_cast<uint32_t const*>(rmem_ptr);
+
+  // if constexpr
+  if (sizeof(T) == 4) {
+    SM90_U32x1_STSM_N::copy(reg_ptr[0], smem_ptr[0]);
+  }
+  else if (sizeof(T) == 8) {
+    SM90_U32x2_STSM_N::copy(reg_ptr[0], reg_ptr[1], smem_ptr[0]);
+  }
+  else if (sizeof(T) == 16) {
+    SM90_U32x4_STSM_N::copy(reg_ptr[0], reg_ptr[1], reg_ptr[2], reg_ptr[3], smem_ptr[0]);
+  }
+  else {
+    static_assert(sizeof(T) == 4 || sizeof(T) == 8 || sizeof(T) == 16, "sizeof(T) is not supported");
+  }
+}
+
+template <class T>
+CUTE_HOST_DEVICE
+void
+copy_stsm_trans(T const* const rmem_ptr,
+                uint128_t* const smem_ptr)
+{
+  uint32_t const* reg_ptr = reinterpret_cast<uint32_t const*>(rmem_ptr);
+
+  // if constexpr
+  if (sizeof(T) == 4) {
+    SM90_U16x2_STSM_T::copy(reg_ptr[0], smem_ptr[0]);
+  }
+  else if (sizeof(T) == 8) {
+    SM90_U16x4_STSM_T::copy(reg_ptr[0], reg_ptr[1], smem_ptr[0]);
+  }
+  else if (sizeof(T) == 16) {
+    SM90_U16x8_STSM_T::copy(reg_ptr[0], reg_ptr[1], reg_ptr[2], reg_ptr[3], smem_ptr[0]);
+  }
+  else {
+    static_assert(sizeof(T) == 4 || sizeof(T) == 8 || sizeof(T) == 16, "sizeof(T) is not supported");
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // end namespace cute
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <cute/arch/copy_sm90_desc.hpp>
+#include <cute/arch/copy_sm90_tma.hpp>
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
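// --- Illustrative sketch (editor's addition, not part of this patch) ---
// The store-direction mirror of the earlier ldsm example: each lane hands
// stmatrix one 16B register fragment and one smem row address. Kernel and
// buffer names are hypothetical.
__global__ void stsm_demo_kernel()
{
  __shared__ cute::uint128_t smem_tile[32];
  cute::uint128_t frag = {};                     // e.g. an accumulator fragment
  cute::copy_stsm(&frag, &smem_tile[threadIdx.x % 32]);
}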
diff --git a/include/cute/arch/copy_sm90_desc.hpp b/include/cute/arch/copy_sm90_desc.hpp
new file mode 100644
index 0000000000..ca8320f665
--- /dev/null
+++ b/include/cute/arch/copy_sm90_desc.hpp
@@ -0,0 +1,194 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ * (BSD-3-Clause text identical to the header reproduced in copy_sm80.hpp above.)
+ **************************************************************************************************/
+#pragma once
+
+#include <cuda.h>
+
+#include <cute/config.hpp>
+
+#include <cute/arch/copy.hpp>
+#include <cute/arch/copy_sm90.hpp>
+
+#include <cute/container/alignment.hpp>
+#include <cute/container/bit_field.hpp>
+#include <cute/numeric/int.hpp>   // to_Format<[u]intX>
+#include <cute/numeric/half.hpp>  // to_Format<half_t>
+
+namespace cute
+{
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+/// Barriers are 64 bits of user-managed information used in broadly two types of synchronization patterns:
+/// 1) arrive/wait on threads (usage: cp.async and warp-specialized kernels)
+/// 2) transaction-based (usage: TMA transaction where a CTA issues one transaction)
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Initialize barrier present in shared memory
+CUTE_HOST_DEVICE
+void
+initialize_barrier(uint64_t& smem_barrier,    // 64-bit user-managed barrier in smem
+                   int thread_count = 1)      // Thread count expected to arrive/wait on this barrier
+{
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+  uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_barrier);
+  asm volatile ("mbarrier.init.shared.b64 [%0], %1;\n"
+                :: "r"(smem_int_ptr),
+                   "r"(thread_count));
+#endif
+}
+
+// Set the number of bytes transferred per transaction
+CUTE_HOST_DEVICE
+void
+set_barrier_transaction_bytes(uint64_t& smem_barrier,  // 64-bit user-managed barrier in smem
+                              uint32_t bytes)          // Number of bytes transferred per TMA transaction
+{
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+  uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_barrier);
+  asm volatile ("mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;\n"
+                :: "r"(smem_int_ptr),
+                   "r"(bytes));
+#endif
+}
+
+// Barrier wait
+CUTE_HOST_DEVICE
+void
+wait_barrier(uint64_t& smem_barrier,  // 64-bit user-managed barrier in smem
+             int phase_bit)           // Current phase bit which the barrier is waiting to flip
+{
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+  uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_barrier);
+  asm volatile(
+    "{\n"
+    ".reg .pred                P1;\n"
+    "LAB_WAIT:\n"
+    "mbarrier.try_wait.parity.shared.b64 P1, [%0], %1;\n"
+    "@P1                       bra.uni DONE;\n"
+    "bra.uni                   LAB_WAIT;\n"
+    "DONE:\n"
+    "}\n"
+    :: "r"(smem_int_ptr),
+       "r"(phase_bit));
+#endif
+}
+
+// Barrier arrive
+CUTE_HOST_DEVICE
+void
+arrive_barrier(uint64_t& smem_barrier)  // 64-bit user-managed barrier in smem
+{
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+  uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_barrier);
+  asm volatile(
+    "{\n"
+    ".reg .b64 state; \n"
+    "mbarrier.arrive.shared.b64   state, [%0];\n"
+    "}\n"
+    :: "r"(smem_int_ptr));
+#endif
+}
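// --- Illustrative sketch (editor's addition, not part of this patch) ---
// How the barrier helpers above compose into a transaction wait: one thread
// initializes the barrier and posts the expected byte count, then every
// thread spins on the current phase. A real kernel inserts the appropriate
// fence between initialization and first use; names here are hypothetical.
__global__ void mbarrier_demo_kernel()
{
  __shared__ uint64_t bar;
  __shared__ __align__(16) char buf[1024];
  if (threadIdx.x == 0) {
    cute::initialize_barrier(bar, /*thread_count=*/1);
    cute::set_barrier_transaction_bytes(bar, sizeof(buf));
    // ... a TMA load targeting `buf` would be issued here, naming `bar` ...
  }
  __syncthreads();
  cute::wait_barrier(bar, /*phase_bit=*/0);  // blocks until the transaction completes
}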
"{\n" + ".reg .pred P1;\n" + "LAB_WAIT:\n" + "mbarrier.try_wait.parity.shared.b64 P1, [%0], %1;\n" + "@P1 bra.uni DONE;\n" + "bra.uni LAB_WAIT;\n" + "DONE:\n" + "}\n" + :: "r"(smem_int_ptr), + "r"(phase_bit)); + +#endif +} + +// Barrier arrive +CUTE_HOST_DEVICE +void +arrive_barrier(uint64_t& smem_barrier) // 64 bits user-manged barrier in smem +{ +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_barrier); + asm volatile( + "{\n" + ".reg .b64 state; \n" + "mbarrier.arrive.shared.b64 state, [%0];\n" + "}\n" + :: "r"(smem_int_ptr)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// TMA Descriptor and utilities +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace TMA { + +enum class SmemSwizzleBits : uint8_t { + DISABLE = 0, + B32 = 1, + B64 = 2, + B128 = 3, +}; + +#if (__CUDACC_VER_MAJOR__ >= 12) + +template +inline CUtensorMapDataType to_CUtensorMapDataType() { + if constexpr (std::is_same::value) { return CU_TENSOR_MAP_DATA_TYPE_UINT8; } else + if constexpr (std::is_same::value) { return CU_TENSOR_MAP_DATA_TYPE_UINT8; } else + if constexpr (std::is_same::value) { return CU_TENSOR_MAP_DATA_TYPE_UINT16; } else + if constexpr (std::is_same::value) { return CU_TENSOR_MAP_DATA_TYPE_UINT32; } else + if constexpr (std::is_same::value) { return CU_TENSOR_MAP_DATA_TYPE_UINT64; } else + if constexpr (std::is_same::value) { return CU_TENSOR_MAP_DATA_TYPE_INT32; } else + if constexpr (std::is_same::value) { return CU_TENSOR_MAP_DATA_TYPE_INT64; } else + if constexpr (std::is_same::value) { return CU_TENSOR_MAP_DATA_TYPE_FLOAT16; } else + if constexpr (std::is_same::value) { return CU_TENSOR_MAP_DATA_TYPE_FLOAT32; } else + if constexpr (std::is_same::value) { return CU_TENSOR_MAP_DATA_TYPE_FLOAT64; } else + if constexpr (std::is_same::value) { return CU_TENSOR_MAP_DATA_TYPE_BFLOAT16; } else + if constexpr (std::is_same::value) { return CU_TENSOR_MAP_DATA_TYPE_TFLOAT32; } else + { static_assert(sizeof(T) < 0, "Unknown TMA Format!"); } +} + +inline CUtensorMapSwizzle to_CUtensorMapSwizzle(SmemSwizzleBits const& t) { + switch (t) { + default: assert(false && "Unknown SmemSwizzleBits!"); + case SmemSwizzleBits::DISABLE: return CU_TENSOR_MAP_SWIZZLE_NONE; + case SmemSwizzleBits::B32: return CU_TENSOR_MAP_SWIZZLE_32B; + case SmemSwizzleBits::B64: return CU_TENSOR_MAP_SWIZZLE_64B; + case SmemSwizzleBits::B128: return CU_TENSOR_MAP_SWIZZLE_128B; + } +} + +#endif // (__CUDACC_VER_MAJOR__ >= 12) +} // end namespace TMA + +#if (__CUDACC_VER_MAJOR__ >= 12) +using TmaDescriptor = CUtensorMap; +#else +using TmaDescriptor = struct { char bytes[128]; }; +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// Initiates a TensorMap Prefetch +//////////////////////////////////////////////////////////////////////////////////////////////////// + +CUTE_HOST_DEVICE +void +prefetch_tma_descriptor(TmaDescriptor const* desc_ptr) +{ +#if defined(CUTE_ARCH_TMA_SM90_ENABLED) + uint64_t gmem_int_desc = reinterpret_cast(desc_ptr); + // Prefetch TMA Descriptor using generic addressing (i.e. 
diff --git a/include/cute/arch/copy_sm90_tma.hpp b/include/cute/arch/copy_sm90_tma.hpp
new file mode 100644
index 0000000000..d6025e4ad8
--- /dev/null
+++ b/include/cute/arch/copy_sm90_tma.hpp
@@ -0,0 +1,552 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ * (BSD-3-Clause text identical to the header reproduced in copy_sm80.hpp above.)
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/arch/copy.hpp>
+#include <cute/arch/copy_sm90_desc.hpp>
+
+namespace cute
+{
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+/// TMA_LOAD : Initiates a TMA copy from global memory to shared memory
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct SM90_TMA_LOAD_1D
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr, uint64_t& smem_mbar,
+       void const* const smem_ptr,
+       int32_t const& crd0)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(&smem_mbar);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes"
+      " [%0], [%1, {%3}], [%2];"
+      :
+      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
+        "r"(crd0)
+      : "memory");
+#else
+    CUTE_RUNTIME_ASSERT("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+  }
+};
+
+struct SM90_TMA_LOAD_2D
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr, uint64_t& smem_mbar,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(&smem_mbar);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes"
+      " [%0], [%1, {%3, %4}], [%2];"
+      :
+      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
+        "r"(crd0), "r"(crd1)
+      : "memory");
+#else
+    CUTE_RUNTIME_ASSERT("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+  }
+};
+
+struct SM90_TMA_LOAD_3D
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr, uint64_t& smem_mbar,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(&smem_mbar);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes"
+      " [%0], [%1, {%3, %4, %5}], [%2];"
+      :
+      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
+        "r"(crd0), "r"(crd1), "r"(crd2)
+      : "memory");
+#else
+    CUTE_RUNTIME_ASSERT("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+  }
+};
+
+struct SM90_TMA_LOAD_4D
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr, uint64_t& smem_mbar,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(&smem_mbar);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes"
+      " [%0], [%1, {%3, %4, %5, %6}], [%2];"
+      :
+      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
+        "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3)
+      : "memory");
+#else
+    CUTE_RUNTIME_ASSERT("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+  }
+};
+
+struct SM90_TMA_LOAD_5D
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr, uint64_t& smem_mbar,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(&smem_mbar);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes"
+      " [%0], [%1, {%3, %4, %5, %6, %7}], [%2];"
+      :
+      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
+        "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3), "r"(crd4)
+      : "memory");
+#else
+    CUTE_RUNTIME_ASSERT("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+  }
+};
+
+struct SM90_TMA_LOAD
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr, uint64_t& smem_mbar,
+       void const* const smem_ptr,
+       int32_t const& crd0)
+  {
+    return SM90_TMA_LOAD_1D::copy(desc_ptr, smem_mbar, smem_ptr, crd0);
+  }
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr, uint64_t& smem_mbar,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1)
+  {
+    return SM90_TMA_LOAD_2D::copy(desc_ptr, smem_mbar, smem_ptr, crd0, crd1);
+  }
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr, uint64_t& smem_mbar,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2)
+  {
+    return SM90_TMA_LOAD_3D::copy(desc_ptr, smem_mbar, smem_ptr, crd0, crd1, crd2);
+  }
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr, uint64_t& smem_mbar,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3)
+  {
+    return SM90_TMA_LOAD_4D::copy(desc_ptr, smem_mbar, smem_ptr, crd0, crd1, crd2, crd3);
+  }
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr, uint64_t& smem_mbar,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4)
+  {
+    return SM90_TMA_LOAD_5D::copy(desc_ptr, smem_mbar, smem_ptr, crd0, crd1, crd2, crd3, crd4);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+/// TMA_LOAD_MULTICAST : Initiates a TMA copy from global memory to the shared memory of multiple CTAs in a cluster
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct SM90_TMA_LOAD_1D_MULTICAST
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr, uint64_t& smem_mbar, uint16_t multicast_mask,
+       void const* const smem_ptr,
+       int32_t const& crd0)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(&smem_mbar);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster"
+      " [%0], [%1, {%4}], [%2], %3;"
+      :
+      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
+        "h"(multicast_mask),
+        "r"(crd0)
+      : "memory");
+#else
+    CUTE_RUNTIME_ASSERT("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+  }
+};
+
+struct SM90_TMA_LOAD_2D_MULTICAST
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr, uint64_t& smem_mbar, uint16_t multicast_mask,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(&smem_mbar);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster"
+      " [%0], [%1, {%4, %5}], [%2], %3;"
+      :
+      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
+        "h"(multicast_mask),
+        "r"(crd0), "r"(crd1)
+      : "memory");
+#else
+    CUTE_RUNTIME_ASSERT("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+  }
+};
+
+struct SM90_TMA_LOAD_3D_MULTICAST
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr, uint64_t& smem_mbar, uint16_t multicast_mask,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(&smem_mbar);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster"
+      " [%0], [%1, {%4, %5, %6}], [%2], %3;"
+      :
+      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
+        "h"(multicast_mask),
+        "r"(crd0), "r"(crd1), "r"(crd2)
+      : "memory");
+#else
+    CUTE_RUNTIME_ASSERT("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+  }
+};
+
+struct SM90_TMA_LOAD_4D_MULTICAST
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr, uint64_t& smem_mbar, uint16_t multicast_mask,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(&smem_mbar);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster"
+      " [%0], [%1, {%4, %5, %6, %7}], [%2], %3;"
+      :
+      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
+        "h"(multicast_mask),
+        "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3)
+      : "memory");
+#else
+    CUTE_RUNTIME_ASSERT("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+  }
+};
+
+struct SM90_TMA_LOAD_5D_MULTICAST
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr, uint64_t& smem_mbar, uint16_t multicast_mask,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(&smem_mbar);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster"
+      " [%0], [%1, {%4, %5, %6, %7, %8}], [%2], %3;"
+      :
+      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
+        "h"(multicast_mask),
+        "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3), "r"(crd4)
+      : "memory");
+#else
+    CUTE_RUNTIME_ASSERT("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+  }
+};
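// --- Illustrative sketch (editor's addition, not part of this patch) ---
// The *_MULTICAST variants above take a 16-bit mask with one bit per CTA
// rank in the cluster: TMA deposits the same tile into the shared memory of
// every CTA whose bit is set. A helper like this (hypothetical) builds the
// mask for a full cluster:
CUTE_HOST_DEVICE
uint16_t
full_cluster_multicast_mask(int cluster_size)   // e.g. 2 or 4 CTAs
{
  return static_cast<uint16_t>((uint32_t(1) << cluster_size) - 1u);
}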
+
+struct SM90_TMA_LOAD_MULTICAST
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr, uint64_t& smem_mbar, uint16_t multicast_mask,
+       void const* const smem_ptr,
+       int32_t const& crd0)
+  {
+    return SM90_TMA_LOAD_1D_MULTICAST::copy(desc_ptr, smem_mbar, multicast_mask, smem_ptr, crd0);
+  }
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr, uint64_t& smem_mbar, uint16_t multicast_mask,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1)
+  {
+    return SM90_TMA_LOAD_2D_MULTICAST::copy(desc_ptr, smem_mbar, multicast_mask, smem_ptr, crd0, crd1);
+  }
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr, uint64_t& smem_mbar, uint16_t multicast_mask,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2)
+  {
+    return SM90_TMA_LOAD_3D_MULTICAST::copy(desc_ptr, smem_mbar, multicast_mask, smem_ptr, crd0, crd1, crd2);
+  }
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr, uint64_t& smem_mbar, uint16_t multicast_mask,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3)
+  {
+    return SM90_TMA_LOAD_4D_MULTICAST::copy(desc_ptr, smem_mbar, multicast_mask, smem_ptr, crd0, crd1, crd2, crd3);
+  }
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr, uint64_t& smem_mbar, uint16_t multicast_mask,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4)
+  {
+    return SM90_TMA_LOAD_5D_MULTICAST::copy(desc_ptr, smem_mbar, multicast_mask, smem_ptr, crd0, crd1, crd2, crd3, crd4);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+/// TMA_STORE : Initiates a TMA copy from shared memory to global memory
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct SM90_TMA_STORE_1D
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr,
+       void const* const smem_ptr,
+       int32_t const& crd0)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.1d.global.shared::cta.bulk_group [%0, {%2}], [%1];"
+      :
+      : "l"(gmem_int_desc), "r"(smem_int_ptr),
+        "r"(crd0)
+      : "memory");
+#else
+    CUTE_RUNTIME_ASSERT("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+  }
+};
+
+struct SM90_TMA_STORE_2D
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.2d.global.shared::cta.bulk_group [%0, {%2, %3}], [%1];"
+      :
+      : "l"(gmem_int_desc), "r"(smem_int_ptr),
+        "r"(crd0), "r"(crd1)
+      : "memory");
+#else
+    CUTE_RUNTIME_ASSERT("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+  }
+};
+
+struct SM90_TMA_STORE_3D
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.3d.global.shared::cta.bulk_group [%0, {%2, %3, %4}], [%1];"
+      :
+      : "l"(gmem_int_desc), "r"(smem_int_ptr),
+        "r"(crd0), "r"(crd1), "r"(crd2)
+      : "memory");
+#else
+    CUTE_RUNTIME_ASSERT("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+  }
+};
+
+struct SM90_TMA_STORE_4D
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.4d.global.shared::cta.bulk_group [%0, {%2, %3, %4, %5}], [%1];"
+      :
+      : "l"(gmem_int_desc), "r"(smem_int_ptr),
+        "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3)
+      : "memory");
+#else
+    CUTE_RUNTIME_ASSERT("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+  }
+};
+
+struct SM90_TMA_STORE_5D
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.5d.global.shared::cta.bulk_group [%0, {%2, %3, %4, %5, %6}], [%1];"
+      :
+      : "l"(gmem_int_desc), "r"(smem_int_ptr),
+        "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3), "r"(crd4)
+      : "memory");
+#else
+    CUTE_RUNTIME_ASSERT("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+  }
+};
+
+struct SM90_TMA_STORE
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr,
+       void const* const smem_ptr,
+       int32_t const& crd0)
+  {
+    return SM90_TMA_STORE_1D::copy(desc_ptr, smem_ptr, crd0);
+  }
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1)
+  {
+    return SM90_TMA_STORE_2D::copy(desc_ptr, smem_ptr, crd0, crd1);
+  }
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2)
+  {
+    return SM90_TMA_STORE_3D::copy(desc_ptr, smem_ptr, crd0, crd1, crd2);
+  }
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3)
+  {
+    return SM90_TMA_STORE_4D::copy(desc_ptr, smem_ptr, crd0, crd1, crd2, crd3);
+  }
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4)
+  {
+    return SM90_TMA_STORE_5D::copy(desc_ptr, smem_ptr, crd0, crd1, crd2, crd3, crd4);
+  }
+};
+
+// Indicate arrival of the warp issuing the TMA_STORE: commits all prior TMA_STOREs as one bulk async group
+CUTE_HOST_DEVICE static void
+tma_store_arrive() {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+  asm volatile("cp.async.bulk.commit_group;");
+#else
+  CUTE_RUNTIME_ASSERT("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+}
+
+// Wait until at most Count prior committed TMA_STORE groups remain in flight
+template <int Count>
+CUTE_HOST_DEVICE static void
+tma_store_wait() {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+  asm volatile(
+    "cp.async.bulk.wait_group.read %0;"
+    :
+    : "n"(Count)
+    : "memory");
+#else
+  CUTE_RUNTIME_ASSERT("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // end namespace cute
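// --- Illustrative sketch (editor's addition, not part of this patch) ---
// An end-to-end device-side TMA flow composed from the primitives above: a
// 2D tile is pulled into smem through a transaction barrier, transformed,
// then written back with a bulk store. Descriptor setup is assumed to have
// happened on the host (see the earlier host-side sketch); all names and the
// single-phase barrier use are hypothetical simplifications.
__global__ void tma_roundtrip_kernel(cute::TmaDescriptor const* load_desc,
                                     cute::TmaDescriptor const* store_desc)
{
  __shared__ __align__(128) cute::half_t tile[64 * 64];
  __shared__ uint64_t bar;
  if (threadIdx.x == 0) {
    cute::initialize_barrier(bar, /*thread_count=*/1);
    cute::set_barrier_transaction_bytes(bar, sizeof(tile));
    cute::SM90_TMA_LOAD_2D::copy(load_desc, bar, tile, /*crd0=*/0, /*crd1=*/0);
  }
  __syncthreads();
  cute::wait_barrier(bar, /*phase_bit=*/0);
  // ... operate on tile ...
  __syncthreads();
  if (threadIdx.x == 0) {
    cute::SM90_TMA_STORE_2D::copy(store_desc, tile, 0, 0);
    cute::tma_store_arrive();      // commit the store as a bulk group
    cute::tma_store_wait<0>();     // wait until no groups remain in flight
  }
}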
diff --git a/include/cute/arch/mma.hpp b/include/cute/arch/mma.hpp
new file mode 100644
index 0000000000..1c1058fcb9
--- /dev/null
+++ b/include/cute/arch/mma.hpp
@@ -0,0 +1,64 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ * (BSD-3-Clause text identical to the header reproduced in copy_sm80.hpp above.)
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/numeric/math.hpp>
+
+namespace cute
+{
+
+//
+// Direct FMA for any type
+//
+
+template <class D, class A = D, class B = D, class C = D>
+struct UniversalFMA
+{
+  using DRegisters = D[1];
+  using ARegisters = A[1];
+  using BRegisters = B[1];
+  using CRegisters = C[1];
+
+  CUTE_HOST_DEVICE static constexpr void
+  fma(D      & d,
+      A const& a,
+      B const& b,
+      C const& c)
+  {
+    // Forward to an ADL/cute free function for these types
+    using cute::fma;
+    fma(d, a, b, c);
+  }
+};
+
+} // end namespace cute
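// --- Illustrative sketch (editor's addition, not part of this patch) ---
// UniversalFMA is the scalar fallback MMA "atom": a 1x1x1 multiply-accumulate
// whose operand types are carried in the (D,A,B,C) register typedefs. A
// direct call looks like this; in practice it is wrapped by cute's MMA atom
// machinery rather than invoked by hand.
CUTE_HOST_DEVICE void universal_fma_demo()
{
  float d = 0.f;
  cute::UniversalFMA<float>::fma(d, 2.f, 3.f, 1.f);   // d = 2*3 + 1 = 7
}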
diff --git a/include/cute/arch/mma_sm61.hpp b/include/cute/arch/mma_sm61.hpp
new file mode 100644
index 0000000000..32a9fbbcb5
--- /dev/null
+++ b/include/cute/arch/mma_sm61.hpp
@@ -0,0 +1,87 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ * (BSD-3-Clause text identical to the header reproduced in copy_sm80.hpp above.)
+ **************************************************************************************************/
+
+#pragma once
+
+#include <cute/config.hpp>
+#include <cute/arch/mma.hpp>
+
+// Config
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610))
+#  define CUTE_ARCH_MMA_SM61_ENABLED
+#endif
+
+namespace cute
+{
+
+struct SM61_DP4A
+{
+  using DRegisters = int32_t[1];
+  using ARegisters = uint32_t[1];
+  using BRegisters = uint32_t[1];
+  using CRegisters = int32_t[1];
+
+  // Register asm fma
+  CUTE_HOST_DEVICE static void
+  fma(int32_t& d, uint32_t const& a, uint32_t const& b, int32_t const& c)
+  {
+#if defined(CUTE_ARCH_MMA_SM61_ENABLED)
+    asm volatile("dp4a.s32.s32 %0, %1, %2, %3;"
+                 : "=r"(d)
+                 : "r"(a), "r"(b), "r"(c));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM61_DP4A without CUTE_ARCH_MMA_SM61_ENABLED");
+#endif
+  }
+};
+
+struct SM61_DP2A
+{
+  using DRegisters = int32_t[1];
+  using ARegisters = uint32_t[1];
+  using BRegisters = uint32_t[1];
+  using CRegisters = int32_t[1];
+
+  // Register asm fma
+  CUTE_HOST_DEVICE static void
+  fma(int32_t& d, uint32_t const& a, uint32_t const& b, int32_t const& c)
+  {
+#if defined(CUTE_ARCH_MMA_SM61_ENABLED)
+    asm volatile("dp2a.lo.s32.s32 %0, %1, %2, %3;"
+                 : "=r"(d)
+                 : "r"(a), "r"(b), "r"(c));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM61_DP2A without CUTE_ARCH_MMA_SM61_ENABLED");
+#endif
+  }
+};
+
+} // namespace cute
diff --git a/include/cute/arch/mma_sm70.hpp b/include/cute/arch/mma_sm70.hpp
new file mode 100644
index 0000000000..139e60041a
--- /dev/null
+++ b/include/cute/arch/mma_sm70.hpp
@@ -0,0 +1,329 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include + +// Config +#if ((__CUDACC_VER_MAJOR__ > 10) || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1)) +# define CUTE_ARCH_MMA_SM70_SUPPORTED +# if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)) +# define CUTE_ARCH_MMA_SM70_ENABLED +# endif +#endif + +namespace cute +{ + +// +// SM70 MMA 884 F16F16F16 +// + +struct SM70_8x8x4_F16F16F16F16_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + // Register asm fma + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM70_ENABLED) + asm volatile("mma.sync.aligned.m8n8k4.row.col.f16.f16.f16.f16" + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6, %7}," + "{%8, %9, %10, %11};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM70_8x8x4_F16F16F16F16_TN without CUTE_ARCH_MMA_SM70_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct SM70_8x8x4_F16F16F16F16_NT +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + // Register asm fma + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM70_ENABLED) + asm volatile("mma.sync.aligned.m8n8k4.col.row.f16.f16.f16.f16" + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6, %7}," + "{%8, %9, %10, %11};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM70_8x8x4_F16F16F16F16_NT without CUTE_ARCH_MMA_SM70_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct SM70_8x8x4_F16F16F16F16_NN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + // Register asm fma + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, 
uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM70_ENABLED) + asm volatile("mma.sync.aligned.m8n8k4.col.col.f16.f16.f16.f16" + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6, %7}," + "{%8, %9, %10, %11};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM70_8x8x4_F16F16F16F16_NN without CUTE_ARCH_MMA_SM70_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct SM70_8x8x4_F16F16F16F16_TT +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + // Register asm fma + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM70_ENABLED) + asm volatile("mma.sync.aligned.m8n8k4.row.row.f16.f16.f16.f16" + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6, %7}," + "{%8, %9, %10, %11};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM70_8x8x4_F16F16F16F16_TT without CUTE_ARCH_MMA_SM70_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// +// SM70 MMA 884 F16F16F32 +// + +struct SM70_8x8x4_F32F16F16F32_TN +{ + using DRegisters = float[8]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[2]; + using CRegisters = float[8]; + + // Register asm fma + CUTE_HOST_DEVICE static void + fma(float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, uint32_t const& b1, + float const& c0, float const& c1, float const& c2, float const& c3, + float const& c4, float const& c5, float const& c6, float const& c7) + { +#if defined(CUTE_ARCH_MMA_SM70_ENABLED) + asm volatile("mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32" + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11}," + "{%12, %13, %14, %15, %16, %17, %18, %19};\n" + : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3), + "=f"(d4), "=f"(d5), "=f"(d6), "=f"(d7) + : "r"(a0), "r"(a1), + "r"(b0), "r"(b1), + "f"(c0), "f"(c1), "f"(c2), "f"(c3), + "f"(c4), "f"(c5), "f"(c6), "f"(c7)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM70_8x8x4_F32F16F16F32_TN without CUTE_ARCH_MMA_SM70_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct SM70_8x8x4_F32F16F16F32_NT +{ + using DRegisters = float[8]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[2]; + using CRegisters = float[8]; + + // Register asm fma + CUTE_HOST_DEVICE static void + fma(float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, uint32_t const& b1, + float const& c0, float const& c1, float const& c2, float const& c3, + float const& c4, float const& c5, float const& c6, float const& c7) + { +#if 
defined(CUTE_ARCH_MMA_SM70_ENABLED) + asm volatile("mma.sync.aligned.m8n8k4.col.row.f32.f16.f16.f32" + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11}," + "{%12, %13, %14, %15, %16, %17, %18, %19};" + : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3), + "=f"(d4), "=f"(d5), "=f"(d6), "=f"(d7) + : "r"(a0), "r"(a1), + "r"(b0), "r"(b1), + "f"(c0), "f"(c1), "f"(c2), "f"(c3), + "f"(c4), "f"(c5), "f"(c6), "f"(c7)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM70_8x8x4_F32F16F16F32_NT without CUTE_ARCH_MMA_SM70_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct SM70_8x8x4_F32F16F16F32_NN +{ + using DRegisters = float[8]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[2]; + using CRegisters = float[8]; + + // Register asm fma + CUTE_HOST_DEVICE static void + fma(float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, uint32_t const& b1, + float const& c0, float const& c1, float const& c2, float const& c3, + float const& c4, float const& c5, float const& c6, float const& c7) + { +#if defined(CUTE_ARCH_MMA_SM70_ENABLED) + asm volatile("mma.sync.aligned.m8n8k4.col.col.f32.f16.f16.f32" + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11}," + "{%12, %13, %14, %15, %16, %17, %18, %19};" + : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3), + "=f"(d4), "=f"(d5), "=f"(d6), "=f"(d7) + : "r"(a0), "r"(a1), + "r"(b0), "r"(b1), + "f"(c0), "f"(c1), "f"(c2), "f"(c3), + "f"(c4), "f"(c5), "f"(c6), "f"(c7)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM70_8x8x4_F32F16F16F32_NN without CUTE_ARCH_MMA_SM70_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct SM70_8x8x4_F32F16F16F32_TT +{ + using DRegisters = float[8]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[2]; + using CRegisters = float[8]; + + // Register asm fma + CUTE_HOST_DEVICE static void + fma(float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, uint32_t const& b1, + float const& c0, float const& c1, float const& c2, float const& c3, + float const& c4, float const& c5, float const& c6, float const& c7) + { +#if defined(CUTE_ARCH_MMA_SM70_ENABLED) + asm volatile("mma.sync.aligned.m8n8k4.row.row.f32.f16.f16.f32" + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11}," + "{%12, %13, %14, %15, %16, %17, %18, %19};" + : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3), + "=f"(d4), "=f"(d5), "=f"(d6), "=f"(d7) + : "r"(a0), "r"(a1), + "r"(b0), "r"(b1), + "f"(c0), "f"(c1), "f"(c2), "f"(c3), + "f"(c4), "f"(c5), "f"(c6), "f"(c7)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM70_8x8x4_F32F16F16F32_TT without CUTE_ARCH_MMA_SM70_ENABLED"); +#endif + } + +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // end namespace cute diff --git a/include/cute/arch/mma_sm75.hpp b/include/cute/arch/mma_sm75.hpp new file mode 100644 index 0000000000..20d2b56c0b --- /dev/null +++ b/include/cute/arch/mma_sm75.hpp @@ -0,0 +1,120 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include + +// Config +#if ((__CUDACC_VER_MAJOR__ > 10) || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)) +# define CUTE_ARCH_MMA_SM75_SUPPORTED +# if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)) +# define CUTE_ARCH_MMA_SM75_ENABLED +# endif +#endif + +namespace cute +{ + +// +// SM75 MMA 1688 F16F16F32 +// + +struct SM75_16x8x8_F32F16F16F32_TN +{ + using DRegisters = float[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = float[4]; + + // Register asm fma + CUTE_HOST_DEVICE static void + fma(float & d0, float & d1, float & d2, float & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + float const& c0, float const& c1, float const& c2, float const& c3) + { +#if defined(CUTE_ARCH_MMA_SM75_ENABLED) + asm volatile("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "f"(c0), "f"(c1), "f"(c2), "f"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM75_16x8x8_F32F16F16F32_TN without CUTE_ARCH_MMA_SM75_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// +// SM75 MMA 8816 S8S8S32 +// + +struct SM75_8x8x16_S32S8S8S32_TN +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + // Register asm fma + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM75_ENABLED) + asm volatile("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32" + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : 
"r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM75_8x8x16_S32S8S8S32_TN without CUTE_ARCH_MMA_SM75_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // end namespace cute diff --git a/include/cute/arch/mma_sm80.hpp b/include/cute/arch/mma_sm80.hpp new file mode 100644 index 0000000000..6050500a47 --- /dev/null +++ b/include/cute/arch/mma_sm80.hpp @@ -0,0 +1,2132 @@ + /************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include <cute/config.hpp>
+#include <cute/arch/mma.hpp>
+
+// Config
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
+#  define CUTE_ARCH_MMA_SM80_ENABLED
+#endif
+
+namespace cute {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x8 TN
+struct SM80_16x8x8_F16F16F16F16_TN
+{
+  using DRegisters = uint32_t[2];
+  using ARegisters = uint32_t[2];
+  using BRegisters = uint32_t[1];
+  using CRegisters = uint32_t[2];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t & d0, uint32_t & d1,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 "
+      "{%0, %1},"
+      "{%2, %3},"
+      "{%4},"
+      "{%5, %6};\n"
+      : "=r"(d0), "=r"(d1)
+      : "r"(a0), "r"(a1),
+        "r"(b0),
+        "r"(c0), "r"(c1));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x8_F16F16F16F16_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x16 TN
+struct SM80_16x8x16_F16F16F16F16_TN
+{
+  using DRegisters = uint32_t[2];
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint32_t[2];
+  using CRegisters = uint32_t[2];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t & d0, uint32_t & d1,
+      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint32_t const& b0, uint32_t const& b1,
+      uint32_t const& c0, uint32_t const& c1)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 "
+      "{%0, %1},"
+      "{%2, %3, %4, %5},"
+      "{%6, %7},"
+      "{%8, %9};\n"
+      : "=r"(d0), "=r"(d1)
+      : "r"(a0), "r"(a1), "r"(a2), "r"(a3),
+        "r"(b0), "r"(b1),
+        "r"(c0), "r"(c1));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x16_F16F16F16F16_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x8 TN
+struct SM80_16x8x8_F32F16F16F32_TN
+{
+  using DRegisters = float[4];
+  using ARegisters = uint32_t[2];
+  using BRegisters = uint32_t[1];
+  using CRegisters = float[4];
+
+  CUTE_HOST_DEVICE static void
+  fma(float & d0, float & d1, float & d2, float & d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0,
+      float const & c0, float const & c1, float const & c2, float const & c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 "
+      "{%0, %1, %2, %3},"
+      "{%4, %5},"
+      "{%6},"
+      "{%7, %8, %9, %10};\n"
+      : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3)
+      : "r"(a0), "r"(a1),
+        "r"(b0),
+        "f"(c0), "f"(c1), "f"(c2), "f"(c3));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x8_F32F16F16F32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x16 TN
+struct SM80_16x8x16_F32F16F16F32_TN
+{
+  using DRegisters = float[4];
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint32_t[2];
+  using CRegisters = float[4];
+
+  CUTE_HOST_DEVICE static void
+  fma(float & d0, float & d1, float & d2, float & d3,
+      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint32_t const& b0, uint32_t const& b1,
+      float const &
c0, float const & c1, float const & c2, float const & c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "f"(c0), "f"(c1), "f"(c2), "f"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x16_F32F16F16F32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x8 TN +struct SM80_16x8x8_F32BF16BF16F32_TN +{ + using DRegisters = float[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(float & d0, float & d1, float & d2, float & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + float const & c0, float const & c1, float const & c2, float const & c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k8.row.col.f32.bf16.bf16.f32 " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "f"(c0), "f"(c1), "f"(c2), "f"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x8_F32BF16BF16F32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x16 TN +struct SM80_16x8x16_F32BF16BF16F32_TN +{ + using DRegisters = float[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(float & d0, float & d1, float & d2, float & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + float const & c0, float const & c1, float const & c2, float const & c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "f"(c0), "f"(c1), "f"(c2), "f"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x16_F32BF16BF16F32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x4 TN +struct SM80_16x8x4_F32TF32TF32F32_TN +{ + using DRegisters = float[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(float & d0, float & d1, float & d2, float & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + float const & c0, float const & c1, float const & c2, float const & c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "f"(c0), "f"(c1), "f"(c2), "f"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x4_F32TF32TF32F32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + 
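+//
+// Usage sketch (illustrative only; not used by any kernel path in this patch):
+// every MMA atom in this header follows the same contract -- the
+// [D|A|B|C]Registers arrays name the per-thread fragment types, and fma() maps
+// those fragments 1:1 onto a single synchronous tensor core instruction.
+// Assuming a warp whose A/B fragments are already staged in the thread-value
+// layout PTX mandates for m16n8k4.tf32, the atom above could be invoked
+// directly; C and D may alias, giving an in-place accumulate. The helper name
+// below is hypothetical:
+//
+//   CUTE_HOST_DEVICE void
+//   tf32_mma_accumulate(float (&d)[4], uint32_t const (&a)[2], uint32_t const (&b)[1])
+//   {
+//     SM80_16x8x4_F32TF32TF32F32_TN::fma(d[0], d[1], d[2], d[3],
+//                                        a[0], a[1],
+//                                        b[0],
+//                                        d[0], d[1], d[2], d[3]);
+//   }
+//
+// In practice these atoms are not called by hand: cute::MMA_Atom and the
+// MMA_Traits specializations (see include/cute/atom/mma_traits_sm80.hpp) wrap
+// them and derive the fragment layouts.
+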
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x8 TN
+struct SM80_16x8x8_F32TF32TF32F32_TN
+{
+  using DRegisters = float[4];
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint32_t[2];
+  using CRegisters = float[4];
+
+  CUTE_HOST_DEVICE static void
+  fma(float & d0, float & d1, float & d2, float & d3,
+      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint32_t const& b0, uint32_t const& b1,
+      float const & c0, float const & c1, float const & c2, float const & c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 "
+      "{%0, %1, %2, %3},"
+      "{%4, %5, %6, %7},"
+      "{%8, %9},"
+      "{%10, %11, %12, %13};\n"
+      : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3)
+      : "r"(a0), "r"(a1), "r"(a2), "r"(a3),
+        "r"(b0), "r"(b1),
+        "f"(c0), "f"(c1), "f"(c2), "f"(c3));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x8_F32TF32TF32F32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 8x8x4 TN
+struct SM80_8x8x4_F64F64F64F64_TN
+{
+  using DRegisters = double[2];
+  using ARegisters = double[1];
+  using BRegisters = double[1];
+  using CRegisters = double[2];
+
+  CUTE_HOST_DEVICE static void
+  fma(double & d0, double & d1,
+      double const& a0,
+      double const& b0,
+      double const& c0, double const& c1)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m8n8k4.row.col.f64.f64.f64.f64 "
+      "{%0, %1},"
+      "{%2},"
+      "{%3},"
+      "{%4, %5};\n"
+      : "=d"(d0), "=d"(d1)
+      : "d"(a0),
+        "d"(b0),
+        "d"(c0), "d"(c1));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM80_8x8x4_F64F64F64F64_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+// MMA 8x8x4 TN with Planar Complex multiplication
+struct SM80_8x8x4_C64C64C64C64_TN
+{
+  using DRegisters = complex<double>[2];
+  using ARegisters = complex<double>[1];
+  using BRegisters = complex<double>[1];
+  using CRegisters = complex<double>[2];
+
+  CUTE_HOST_DEVICE static void
+  fma(complex<double> & d0, complex<double> & d1,
+      complex<double> const& a0,
+      complex<double> const& b0,
+      complex<double> const& c0, complex<double> const& c1)
+  {
+    // Because thrust::complex does not provide a mutable ref
+    double& rd0 = reinterpret_cast<double(&)[2]>(d0)[0];
+    double& id0 = reinterpret_cast<double(&)[2]>(d0)[1];
+    double& rd1 = reinterpret_cast<double(&)[2]>(d1)[0];
+    double& id1 = reinterpret_cast<double(&)[2]>(d1)[1];
+
+    // d.real() = a.real() * b.real() + c.real();
+    SM80_8x8x4_F64F64F64F64_TN::fma(
+      rd0, rd1,
+      a0.real(),
+      b0.real(),
+      c0.real(), c1.real());
+
+    // d.imag() = a.imag() * b.real() + c.imag();
+    SM80_8x8x4_F64F64F64F64_TN::fma(
+      id0, id1,
+      a0.imag(),
+      b0.real(),
+      c0.imag(), c1.imag());
+
+    // d.real() = -a.imag() * b.imag() + d.real();
+    SM80_8x8x4_F64F64F64F64_TN::fma(
+      rd0, rd1,
+      -a0.imag(),
+      b0.imag(),
+      d0.real(), d1.real());
+
+    // d.imag() = a.real() * b.imag() + d.imag();
+    SM80_8x8x4_F64F64F64F64_TN::fma(
+      id0, id1,
+      a0.real(),
+      b0.imag(),
+      d0.imag(), d1.imag());
+  }
+};
+
+// MMA 8x8x4 TN with Gaussian Complex multiplication:
+//   (a + bi)*(c + di)
+// yields
+//   t0 += a*c
+//   t1 += b*d
+//   t2 += (a+b)*(c+d)
+// then
+//   re = t0 - t1
+//   im = t2 - t0 - t1
+// (three real MMAs instead of four, since (a+b)*(c+d) - a*c - b*d = a*d + b*c)
+struct SM80_8x8x4_GC64C64C64GC64_TN
+{
+  struct GaussComplex {
+    double t0, t1, t2;
+
+    CUTE_HOST_DEVICE //constexpr
+    operator complex<double>() const { return complex<double>(t0 - t1, t2 - t0 - t1); }
+
+    CUTE_HOST_DEVICE friend //constexpr
+    complex<double> operator*(GaussComplex const& a, complex<double> const& b) { return static_cast<complex<double>>(a) * b; }
+    CUTE_HOST_DEVICE friend //constexpr
+    complex<double> operator*(complex<double> const& a, GaussComplex const& b) { return b * a; }
+
+    CUTE_HOST_DEVICE friend //constexpr
+    complex<double> operator+(GaussComplex const& a, complex<double> const& b) { return static_cast<complex<double>>(a) + b; }
+    CUTE_HOST_DEVICE friend //constexpr
+    complex<double> operator+(complex<double> const& a, GaussComplex const& b) { return b + a; }
+  };
+
+  using DRegisters = GaussComplex[2];
+  using ARegisters = complex<double>[1];
+  using BRegisters = complex<double>[1];
+  using CRegisters = GaussComplex[2];
+
+  CUTE_HOST_DEVICE static void
+  fma(GaussComplex & d0, GaussComplex & d1,
+      complex<double> const& a0,
+      complex<double> const& b0,
+      GaussComplex const& c0, GaussComplex const& c1)
+  {
+    SM80_8x8x4_F64F64F64F64_TN::fma(d0.t0, d1.t0,
+                                    a0.real(),
+                                    b0.real(),
+                                    c0.t0, c1.t0);
+    SM80_8x8x4_F64F64F64F64_TN::fma(d0.t1, d1.t1,
+                                    a0.imag(),
+                                    b0.imag(),
+                                    c0.t1, c1.t1);
+    SM80_8x8x4_F64F64F64F64_TN::fma(d0.t2, d1.t2,
+                                    a0.real() + a0.imag(),
+                                    b0.real() + b0.imag(),
+                                    c0.t2, c1.t2);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 8x8x16 TN
+struct SM80_8x8x16_S32S8S8S32_TN
+{
+  using DRegisters = uint32_t[2];
+  using ARegisters = uint32_t[1];
+  using BRegisters = uint32_t[1];
+  using CRegisters = uint32_t[2];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t & d0, uint32_t & d1,
+      uint32_t const& a0,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 "
+      "{%0, %1},"
+      "{%2},"
+      "{%3},"
+      "{%4, %5};\n"
+      : "=r"(d0), "=r"(d1)
+      : "r"(a0),
+        "r"(b0),
+        "r"(c0), "r"(c1));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM80_8x8x16_S32S8S8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 8x8x16 TN
+struct SM80_8x8x16_S32S8S8S32_TN_SATURATE
+{
+  using DRegisters = uint32_t[2];
+  using ARegisters = uint32_t[1];
+  using BRegisters = uint32_t[1];
+  using CRegisters = uint32_t[2];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t & d0, uint32_t & d1,
+      uint32_t const& a0,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite "
+      "{%0, %1},"
+      "{%2},"
+      "{%3},"
+      "{%4, %5};\n"
+      : "=r"(d0), "=r"(d1)
+      : "r"(a0),
+        "r"(b0),
+        "r"(c0), "r"(c1));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM80_8x8x16_S32S8S8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x16 TN
+struct SM80_16x8x16_S32S8S8S32_TN
+{
+  using DRegisters = uint32_t[4];
+  using ARegisters = uint32_t[2];
+  using BRegisters = uint32_t[1];
+  using CRegisters = uint32_t[4];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 "
+      "{%0, %1, %2, %3},"
+      "{%4, %5},"
+      "{%6},"
+      "{%7, %8, %9, %10};\n"
+      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+      : "r"(a0), "r"(a1),
+        "r"(b0),
+        "r"(c0), "r"(c1), "r"(c2), "r"(c3));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use
SM80_16x8x16_S32S8S8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x16 TN +struct SM80_16x8x16_S32S8S8S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x16_S32S8S8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32S8S8S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x32_S32S8S8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32S8S8S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x32_S32S8S8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x16 TN +struct SM80_8x8x16_S32S8U8S32_TN +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = 
uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k16.row.col.s32.s8.u8.s32 " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_8x8x16_S32S8U8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x16 TN +struct SM80_8x8x16_S32S8U8S32_TN_SATURATE +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k16.row.col.s32.s8.u8.s32.satfinite " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_8x8x16_S32S8U8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x16 TN +struct SM80_16x8x16_S32S8U8S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.s8.u8.s32 " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x16_S32S8U8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x16 TN +struct SM80_16x8x16_S32S8U8S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.s8.u8.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x16_S32S8U8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32S8U8S32_TN +{ + using DRegisters = uint32_t[4]; + using 
ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.u8.s32 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x32_S32S8U8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32S8U8S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.u8.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x32_S32S8U8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x16 TN +struct SM80_8x8x16_S32U8S8S32_TN +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k16.row.col.s32.u8.s8.s32 " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_8x8x16_S32U8S8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x16 TN +struct SM80_8x8x16_S32U8S8S32_TN_SATURATE +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k16.row.col.s32.u8.s8.s32.satfinite " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use 
SM80_8x8x16_S32U8S8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x16 TN +struct SM80_16x8x16_S32U8S8S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.u8.s8.s32 " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x16_S32U8S8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x16 TN +struct SM80_16x8x16_S32U8S8S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.u8.s8.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x16_S32U8S8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32U8S8S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.u8.s8.s32 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x32_S32U8S8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32U8S8S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + 
uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.u8.s8.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x32_S32U8S8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x16 TN +struct SM80_8x8x16_S32U8U8S32_TN +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k16.row.col.s32.u8.u8.s32 " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_8x8x16_S32U8U8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x16 TN +struct SM80_8x8x16_S32U8U8S32_TN_SATURATE +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k16.row.col.s32.u8.u8.s32.satfinite " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_8x8x16_S32U8U8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x16 TN +struct SM80_16x8x16_S32U8U8S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.u8.u8.s32 " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x16_S32U8U8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x16 TN +struct SM80_16x8x16_S32U8U8S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = 
uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.u8.u8.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x16_S32U8U8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32U8U8S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.u8.u8.s32 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x32_S32U8U8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32U8U8S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.u8.u8.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x32_S32U8U8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x32 TN +struct SM80_8x8x32_S32S4S4S32_TN +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k32.row.col.s32.s4.s4.s32 " + "{%0, %1}," + "{%2}," + "{%3}," + 
"{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_8x8x32_S32S4S4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x32 TN +struct SM80_8x8x32_S32S4S4S32_TN_SATURATE +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k32.row.col.s32.s4.s4.s32.satfinite " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_8x8x32_S32S4S4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32S4S4S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.s4.s4.s32 " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x32_S32S4S4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32S4S4S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.s4.s4.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x32_S32S4S4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x64 TN +struct SM80_16x8x64_S32S4S4S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + 
uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.s4.s4.s32 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x64_S32S4S4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x64 TN +struct SM80_16x8x64_S32S4S4S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.s4.s4.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x64_S32S4S4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x32 TN +struct SM80_8x8x32_S32S4U4S32_TN +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k32.row.col.s32.s4.u4.s32 " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_8x8x32_S32S4U4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x32 TN +struct SM80_8x8x32_S32S4U4S32_TN_SATURATE +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k32.row.col.s32.s4.u4.s32.satfinite " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_8x8x32_S32S4U4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32S4U4S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters 
= uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.s4.u4.s32 " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x32_S32S4U4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32S4U4S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.s4.u4.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x32_S32S4U4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x64 TN +struct SM80_16x8x64_S32S4U4S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.s4.u4.s32 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x64_S32S4U4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x64 TN +struct SM80_16x8x64_S32S4U4S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.s4.u4.s32.satfinite " + "{%0, %1, %2, 
%3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x64_S32S4U4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x32 TN +struct SM80_8x8x32_S32U4S4S32_TN +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k32.row.col.s32.u4.s4.s32 " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_8x8x32_S32U4S4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x32 TN +struct SM80_8x8x32_S32U4S4S32_TN_SATURATE +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k32.row.col.s32.u4.s4.s32.satfinite " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_8x8x32_S32U4S4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32U4S4S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.u4.s4.s32 " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x32_S32U4S4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32U4S4S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if 
defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.u4.s4.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x32_S32U4S4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x64 TN +struct SM80_16x8x64_S32U4S4S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.u4.s4.s32 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x64_S32U4S4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x64 TN +struct SM80_16x8x64_S32U4S4S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.u4.s4.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x64_S32U4S4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x32 TN +struct SM80_8x8x32_S32U4U4S32_TN +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k32.row.col.s32.u4.u4.s32 " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_8x8x32_S32U4U4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x32 TN 
+struct SM80_8x8x32_S32U4U4S32_TN_SATURATE +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k32.row.col.s32.u4.u4.s32.satfinite " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_8x8x32_S32U4U4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32U4U4S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.u4.u4.s32 " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x32_S32U4U4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x32 TN +struct SM80_16x8x32_S32U4U4S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.u4.u4.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x32_S32U4U4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x64 TN +struct SM80_16x8x64_S32U4U4S32_TN +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.u4.u4.s32 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), 
"r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x64_S32U4U4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x64 TN +struct SM80_16x8x64_S32U4U4S32_TN_SATURATE +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.u4.u4.s32.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x64_S32U4U4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 8x8x128 TN +struct SM80_8x8x128_S32U1U1S32_TN_XORPOPC +{ + using DRegisters = uint32_t[2]; + using ARegisters = uint32_t[1]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[2]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, + uint32_t const& a0, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m8n8k128.row.col.s32.b1.b1.s32.xor.popc " + "{%0, %1}," + "{%2}," + "{%3}," + "{%4, %5};\n" + : "=r"(d0), "=r"(d1) + : "r"(a0), + "r"(b0), + "r"(c0), "r"(c1)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_8x8x128_S32U1U1S32_TN_XORPOPC without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x128 TN +struct SM80_16x8x128_S32U1U1S32_TN_XORPOPC +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[2]; + using BRegisters = uint32_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, + uint32_t const& b0, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k128.row.col.s32.b1.b1.s32.xor.popc " + "{%0, %1, %2, %3}," + "{%4, %5}," + "{%6}," + "{%7, %8, %9, %10};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), + "r"(b0), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x128_S32U1U1S32_TN_XORPOPC without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 16x8x256 TN +struct SM80_16x8x256_S32U1U1S32_TN_XORPOPC +{ + using DRegisters = uint32_t[4]; + using ARegisters = uint32_t[4]; + using BRegisters = uint32_t[2]; + using 
CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint32_t const& b0, uint32_t const& b1, + uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) + { +#if defined(CUTE_ARCH_MMA_SM80_ENABLED) + asm volatile( + "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.xor.popc " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11, %12, %13};\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "r"(b0), "r"(b1), + "r"(c0), "r"(c1), "r"(c2), "r"(c3)); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM80_16x8x256_S32U1U1S32_TN_XORPOPC without CUTE_ARCH_MMA_SM80_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cute diff --git a/include/cute/arch/mma_sm90.hpp b/include/cute/arch/mma_sm90.hpp new file mode 100644 index 0000000000..08fe2b2810 --- /dev/null +++ b/include/cute/arch/mma_sm90.hpp @@ -0,0 +1,961 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/arch/mma.hpp>
+
+// Config
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && defined(__CUDA_ARCH_FEAT_SM90_ALL))
+#  define CUTE_ARCH_MMA_SM90_ENABLED
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cute {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x4 TN
+struct SM90_16x8x4_F64F64F64F64_TN
+{
+  using DRegisters = double[4];
+  using ARegisters = double[2];
+  using BRegisters = double[1];
+  using CRegisters = double[4];
+
+  CUTE_HOST_DEVICE static void
+  fma(double      & d0, double      & d1, double      & d2, double      & d3,
+      double const& a0, double const& a1,
+      double const& b0,
+      double const& c0, double const& c1, double const& c2, double const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM90_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k4.row.col.f64.f64.f64.f64 "
+      "{%0, %1, %2, %3},"
+      "{%4, %5},"
+      "{%6},"
+      "{%7, %8, %9, %10};\n"
+      : "=d"(d0), "=d"(d1), "=d"(d2), "=d"(d3)
+      :  "d"(a0),  "d"(a1),
+         "d"(b0),
+         "d"(c0),  "d"(c1),  "d"(c2),  "d"(c3));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM90_16x8x4_F64F64F64F64_TN without CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x8 TN
+struct SM90_16x8x8_F64F64F64F64_TN
+{
+  using DRegisters = double[4];
+  using ARegisters = double[4];
+  using BRegisters = double[2];
+  using CRegisters = double[4];
+
+  CUTE_HOST_DEVICE static void
+  fma(double      & d0, double      & d1, double      & d2, double      & d3,
+      double const& a0, double const& a1, double const& a2, double const& a3,
+      double const& b0, double const& b1,
+      double const& c0, double const& c1, double const& c2, double const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM90_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k8.row.col.f64.f64.f64.f64 "
+      "{%0, %1, %2, %3},"
+      "{%4, %5, %6, %7},"
+      "{%8, %9},"
+      "{%10, %11, %12, %13};\n"
+      : "=d"(d0), "=d"(d1), "=d"(d2), "=d"(d3)
+      :  "d"(a0),  "d"(a1),  "d"(a2),  "d"(a3),
+         "d"(b0),  "d"(b1),
+         "d"(c0),  "d"(c1),  "d"(c2),  "d"(c3));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM90_16x8x8_F64F64F64F64_TN without CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x16 TN
+struct SM90_16x8x16_F64F64F64F64_TN
+{
+  using DRegisters = double[4];
+  using ARegisters = double[8];
+  using BRegisters = double[4];
+  using CRegisters = double[4];
+
+  CUTE_HOST_DEVICE static void
+  fma(double      & d0, double      & d1, double      & d2, double      & d3,
+      double const& a0, double const& a1, double const& a2, double const& a3,
+      double const& a4, double const& a5, double const& a6, double const& a7,
+      double const& b0, double const& b1, double const& b2, double const& b3,
+      double const& c0, double const& c1, double const& c2, double const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM90_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k16.row.col.f64.f64.f64.f64 "
+      "{%0, %1, %2, %3},"
+      "{%4, %5, %6, %7, %8, %9, %10, %11},"
+      "{%12, %13, %14, %15},"
+      "{%16, %17, %18, %19};\n"
+      : "=d"(d0), "=d"(d1), "=d"(d2), "=d"(d3)
+      :  "d"(a0),  "d"(a1),  "d"(a2),  "d"(a3),
+         "d"(a4),  "d"(a5),  "d"(a6),  "d"(a7),
+         "d"(b0),  "d"(b1),  "d"(b2),  "d"(b3),
+         "d"(c0),  "d"(c1),  "d"(c2),  "d"(c3));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM90_16x8x16_F64F64F64F64_TN without CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
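The `DRegisters`/`ARegisters`/`BRegisters` aliases above encode the per-thread fragment sizes of each atom. As a quick sanity check on the tile arithmetic (a 16x8 f64 accumulator distributed over a 32-thread group is 4 doubles per thread), here is a minimal host-side sketch; it assumes only that `<cute/arch/mma_sm90.hpp>` is on the include path and is not part of the patch itself:

// Host-side sketch: the declared fragment sizes match the MMA tile arithmetic.
#include <cute/arch/mma_sm90.hpp>

int main() {
  using Op = cute::SM90_16x8x4_F64F64F64F64_TN;
  // D/C tile is 16x8 f64 over 32 threads: 16*8/32 = 4 values per thread.
  static_assert(sizeof(Op::DRegisters) / sizeof(double) == 4, "D fragment size");
  // A tile is 16x4: 16*4/32 = 2 values per thread.
  static_assert(sizeof(Op::ARegisters) / sizeof(double) == 2, "A fragment size");
  // B tile is 8x4: 8*4/32 = 1 value per thread.
  static_assert(sizeof(Op::BRegisters) / sizeof(double) == 1, "B fragment size");
  return 0;
}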
+// MMA 16x8x4 TN
+struct SM90_16x8x4_C64C64C64C64_TN
+{
+  using DRegisters = complex<double>[4];
+  using ARegisters = complex<double>[2];
+  using BRegisters = complex<double>[1];
+  using CRegisters = complex<double>[4];
+
+  CUTE_HOST_DEVICE static void
+  fma(complex<double>      & d0, complex<double>      & d1,
+      complex<double>      & d2, complex<double>      & d3,
+      complex<double> const& a0, complex<double> const& a1,
+      complex<double> const& b0,
+      complex<double> const& c0, complex<double> const& c1,
+      complex<double> const& c2, complex<double> const& c3)
+  {
+    // Because thrust::complex does not provide a mutable ref
+    double& rd0 = reinterpret_cast<double(&)[2]>(d0)[0];
+    double& id0 = reinterpret_cast<double(&)[2]>(d0)[1];
+    double& rd1 = reinterpret_cast<double(&)[2]>(d1)[0];
+    double& id1 = reinterpret_cast<double(&)[2]>(d1)[1];
+    double& rd2 = reinterpret_cast<double(&)[2]>(d2)[0];
+    double& id2 = reinterpret_cast<double(&)[2]>(d2)[1];
+    double& rd3 = reinterpret_cast<double(&)[2]>(d3)[0];
+    double& id3 = reinterpret_cast<double(&)[2]>(d3)[1];
+
+    // d.real() = a.real() * b.real() + c.real();
+    SM90_16x8x4_F64F64F64F64_TN::fma(
+        rd0, rd1, rd2, rd3,
+        a0.real(), a1.real(),
+        b0.real(),
+        c0.real(), c1.real(), c2.real(), c3.real());
+
+    // d.imag() = a.imag() * b.real() + c.imag();
+    SM90_16x8x4_F64F64F64F64_TN::fma(
+        id0, id1, id2, id3,
+        a0.imag(), a1.imag(),
+        b0.real(),
+        c0.imag(), c1.imag(), c2.imag(), c3.imag());
+
+    // d.real() = -a.imag() * b.imag() + d.real();
+    SM90_16x8x4_F64F64F64F64_TN::fma(
+        rd0, rd1, rd2, rd3,
+        -a0.imag(), -a1.imag(),
+        b0.imag(),
+        d0.real(), d1.real(), d2.real(), d3.real());
+
+    // d.imag() = a.real() * b.imag() + d.imag();
+    SM90_16x8x4_F64F64F64F64_TN::fma(
+        id0, id1, id2, id3,
+        a0.real(), a1.real(),
+        b0.imag(),
+        d0.imag(), d1.imag(), d2.imag(), d3.imag());
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x8 TN
+struct SM90_16x8x8_C64C64C64C64_TN
+{
+  using DRegisters = complex<double>[4];
+  using ARegisters = complex<double>[4];
+  using BRegisters = complex<double>[2];
+  using CRegisters = complex<double>[4];
+
+  CUTE_HOST_DEVICE static void
+  fma(complex<double>      & d0, complex<double>      & d1,
+      complex<double>      & d2, complex<double>      & d3,
+      complex<double> const& a0, complex<double> const& a1,
+      complex<double> const& a2, complex<double> const& a3,
+      complex<double> const& b0, complex<double> const& b1,
+      complex<double> const& c0, complex<double> const& c1,
+      complex<double> const& c2, complex<double> const& c3)
+  {
+    // Because thrust::complex does not provide a mutable ref
+    double& rd0 = reinterpret_cast<double(&)[2]>(d0)[0];
+    double& id0 = reinterpret_cast<double(&)[2]>(d0)[1];
+    double& rd1 = reinterpret_cast<double(&)[2]>(d1)[0];
+    double& id1 = reinterpret_cast<double(&)[2]>(d1)[1];
+    double& rd2 = reinterpret_cast<double(&)[2]>(d2)[0];
+    double& id2 = reinterpret_cast<double(&)[2]>(d2)[1];
+    double& rd3 = reinterpret_cast<double(&)[2]>(d3)[0];
+    double& id3 = reinterpret_cast<double(&)[2]>(d3)[1];
+
+    // d.real() = a.real() * b.real() + c.real();
+    SM90_16x8x8_F64F64F64F64_TN::fma(
+        rd0, rd1, rd2, rd3,
+        a0.real(), a1.real(), a2.real(), a3.real(),
+        b0.real(), b1.real(),
+        c0.real(), c1.real(), c2.real(), c3.real());
+
+    // d.imag() = a.imag() * b.real() + c.imag();
+    SM90_16x8x8_F64F64F64F64_TN::fma(
+        id0, id1, id2, id3,
+        a0.imag(), a1.imag(), a2.imag(), a3.imag(),
+        b0.real(), b1.real(),
+        c0.imag(), c1.imag(), c2.imag(), c3.imag());
+
+    // d.real() = -a.imag() * b.imag() + d.real();
+    SM90_16x8x8_F64F64F64F64_TN::fma(
+        rd0, rd1, rd2, rd3,
+        -a0.imag(), -a1.imag(), -a2.imag(), -a3.imag(),
+        b0.imag(), b1.imag(),
+        d0.real(), d1.real(), d2.real(), d3.real());
+
+    // d.imag() = a.real() * b.imag() + d.imag();
+    SM90_16x8x8_F64F64F64F64_TN::fma(
+        id0, id1, id2, id3,
+        a0.real(), a1.real(), a2.real(), a3.real(),
+        b0.imag(), b1.imag(),
+        d0.imag(), d1.imag(), d2.imag(), d3.imag());
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
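The C64 atoms above synthesize a complex multiply-accumulate from four real-valued D = A*B + C passes over the f64 atoms. A standalone host-side check of that algebra, using std::complex purely for illustration (not part of the patch):

// Verifies the four-step real-MMA decomposition used by the C64 atoms.
#include <cassert>
#include <complex>

int main() {
  std::complex<double> a(2.0, 3.0), b(5.0, -1.0), c(0.5, 0.25);

  double dr = a.real() * b.real() + c.real();  // step 1: d.real  = ar*br + cr
  double di = a.imag() * b.real() + c.imag();  // step 2: d.imag  = ai*br + ci
  dr = -a.imag() * b.imag() + dr;              // step 3: d.real -= ai*bi
  di =  a.real() * b.imag() + di;              // step 4: d.imag += ar*bi

  assert(std::complex<double>(dr, di) == a * b + c);
  return 0;
}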
+// MMA 16x8x16 TN
+struct SM90_16x8x16_C64C64C64C64_TN
+{
+  using DRegisters = complex<double>[4];
+  using ARegisters = complex<double>[8];
+  using BRegisters = complex<double>[4];
+  using CRegisters = complex<double>[4];
+
+  CUTE_HOST_DEVICE static void
+  fma(complex<double>      & d0, complex<double>      & d1,
+      complex<double>      & d2, complex<double>      & d3,
+      complex<double> const& a0, complex<double> const& a1,
+      complex<double> const& a2, complex<double> const& a3,
+      complex<double> const& a4, complex<double> const& a5,
+      complex<double> const& a6, complex<double> const& a7,
+      complex<double> const& b0, complex<double> const& b1,
+      complex<double> const& b2, complex<double> const& b3,
+      complex<double> const& c0, complex<double> const& c1,
+      complex<double> const& c2, complex<double> const& c3)
+  {
+    // Because thrust::complex does not provide a mutable ref
+    double& rd0 = reinterpret_cast<double(&)[2]>(d0)[0];
+    double& id0 = reinterpret_cast<double(&)[2]>(d0)[1];
+    double& rd1 = reinterpret_cast<double(&)[2]>(d1)[0];
+    double& id1 = reinterpret_cast<double(&)[2]>(d1)[1];
+    double& rd2 = reinterpret_cast<double(&)[2]>(d2)[0];
+    double& id2 = reinterpret_cast<double(&)[2]>(d2)[1];
+    double& rd3 = reinterpret_cast<double(&)[2]>(d3)[0];
+    double& id3 = reinterpret_cast<double(&)[2]>(d3)[1];
+
+    // d.real() = a.real() * b.real() + c.real();
+    SM90_16x8x16_F64F64F64F64_TN::fma(
+        rd0, rd1, rd2, rd3,
+        a0.real(), a1.real(), a2.real(), a3.real(),
+        a4.real(), a5.real(), a6.real(), a7.real(),
+        b0.real(), b1.real(), b2.real(), b3.real(),
+        c0.real(), c1.real(), c2.real(), c3.real());
+
+    // d.imag() = a.imag() * b.real() + c.imag();
+    SM90_16x8x16_F64F64F64F64_TN::fma(
+        id0, id1, id2, id3,
+        a0.imag(), a1.imag(), a2.imag(), a3.imag(),
+        a4.imag(), a5.imag(), a6.imag(), a7.imag(),
+        b0.real(), b1.real(), b2.real(), b3.real(),
+        c0.imag(), c1.imag(), c2.imag(), c3.imag());
+
+    // d.real() = -a.imag() * b.imag() + d.real();
+    SM90_16x8x16_F64F64F64F64_TN::fma(
+        rd0, rd1, rd2, rd3,
+        -a0.imag(), -a1.imag(), -a2.imag(), -a3.imag(),
+        -a4.imag(), -a5.imag(), -a6.imag(), -a7.imag(),
+        b0.imag(), b1.imag(), b2.imag(), b3.imag(),
+        d0.real(), d1.real(), d2.real(), d3.real());
+
+    // d.imag() = a.real() * b.imag() + d.imag();
+    SM90_16x8x16_F64F64F64F64_TN::fma(
+        id0, id1, id2, id3,
+        a0.real(), a1.real(), a2.real(), a3.real(),
+        a4.real(), a5.real(), a6.real(), a7.real(),
+        b0.imag(), b1.imag(), b2.imag(), b3.imag(),
+        d0.imag(), d1.imag(), d2.imag(), d3.imag());
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cute
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <cute/arch/mma_sm90_desc.hpp>
+#include <cute/arch/mma_sm90_gmma.hpp>
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cute {
+namespace GMMA {
+
+template <
+  class ElementA,
+  class ElementB,
+  class ElementC,
+  class TileShape_MNK,
+  GMMA::Major MajorA = GMMA::Major::K,
+  GMMA::Major MajorB = GMMA::Major::K,
+  auto... Args   // e.g. GMMA::ScaleOut::One, [GMMA::ScaleIn::One, GMMA::ScaleIn::One]
+                 //      But most commonly leave empty for defaults
+>
+CUTE_HOST_DEVICE constexpr
+auto
+ss_op_selector()
+{
+  static_assert(is_static<TileShape_MNK>::value, "TileShape_MNK must be static.");
+  static_assert(rank(TileShape_MNK{}) == 3, "TileShape_MNK must be rank 3.");
+  static_assert(size<0>(TileShape_MNK{}) % 64 == 0, "Tile_M must be a multiple of 64.");
+  auto Tile_N = size<1>(TileShape_MNK{});
+
+  // FP16 accumulator
+  if constexpr (std::is_same_v<ElementC, half_t>) {
+    static_assert(std::is_same_v<ElementA, half_t>, "Element types for AB must be half if ElementC is half.");
+    static_assert(std::is_same_v<ElementB, half_t>, "Element types for AB must be half if ElementC is half.");
+    static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16.");
+
+    // Dispatch against the Tile N mode size
+    if constexpr (Tile_N % 256 == 0) {
+      return SM90_64x256x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+    }
+    else if constexpr (Tile_N % 192 == 0) {
+      return SM90_64x192x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+    }
+    else if constexpr (Tile_N % 128 == 0) {
+      return SM90_64x128x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+    }
+    else if constexpr (Tile_N % 96 == 0) {
+      return SM90_64x96x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+    }
+    else if constexpr (Tile_N % 64 == 0) {
+      return SM90_64x64x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+    }
+    else if constexpr (Tile_N % 32 == 0) {
+      return SM90_64x32x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+    }
+    else if constexpr (Tile_N % 16 == 0) {
+      return SM90_64x16x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+    }
+    else if constexpr (Tile_N % 8 == 0) {
+      return SM90_64x8x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+    }
+    else {
+      static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+    }
+  }
+
+  // FP32 accumulator
+  else if constexpr (std::is_same_v<ElementC, float>) {
+
+    // FP16 inputs
+    if constexpr (std::is_same_v<ElementA, half_t>) {
+      static_assert(std::is_same_v<ElementA, ElementB>, "ElementA and ElementB must be the same type for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16.");
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90_64x256x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90_64x192x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90_64x128x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90_64x96x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90_64x64x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90_64x32x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90_64x16x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90_64x8x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // BF16 inputs
+    else if constexpr (std::is_same_v<ElementA, bfloat16_t>) {
+      static_assert(std::is_same_v<ElementA, ElementB>, "ElementA and ElementB must be the same type for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90_64x256x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90_64x192x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90_64x128x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90_64x96x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90_64x64x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90_64x32x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90_64x16x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90_64x8x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // TF32 inputs
+    else if constexpr (std::is_same_v<ElementA, tfloat32_t>) {
+      static_assert(std::is_same_v<ElementA, ElementB>, "ElementA and ElementB must be the same type for this config.");
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 8 == 0, "Tile_K must be a multiple of 8.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90_64x256x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90_64x192x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90_64x128x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90_64x96x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90_64x64x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90_64x32x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90_64x16x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90_64x8x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+    else {
+      static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for requested configuration.");
+    }
+  }
+
+  // S32 accumulator
+  else if constexpr (std::is_same_v<ElementC, int32_t>) {
+    static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+    static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+    static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+    // ElementA == int8_t && ElementB == int8_t
+    if constexpr (std::is_same_v<ElementA, int8_t> && std::is_same_v<ElementB, int8_t>) {
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90_64x256x32_S32S8S8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90_64x192x32_S32S8S8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90_64x128x32_S32S8S8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90_64x96x32_S32S8S8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90_64x64x32_S32S8S8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90_64x32x32_S32S8S8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90_64x16x32_S32S8S8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90_64x8x32_S32S8S8_SS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // ElementA == int8_t && ElementB == uint8_t
+    else if constexpr (std::is_same_v<ElementA, int8_t> && std::is_same_v<ElementB, uint8_t>) {
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90_64x256x32_S32S8U8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90_64x192x32_S32S8U8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90_64x128x32_S32S8U8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90_64x96x32_S32S8U8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90_64x64x32_S32S8U8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90_64x32x32_S32S8U8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90_64x16x32_S32S8U8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90_64x8x32_S32S8U8_SS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // ElementA == uint8_t && ElementB == int8_t
+    else if constexpr (std::is_same_v<ElementA, uint8_t> && std::is_same_v<ElementB, int8_t>) {
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90_64x256x32_S32U8S8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90_64x192x32_S32U8S8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90_64x128x32_S32U8S8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90_64x96x32_S32U8S8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90_64x64x32_S32U8S8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90_64x32x32_S32U8S8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90_64x16x32_S32U8S8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90_64x8x32_S32U8S8_SS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // ElementA == uint8_t && ElementB == uint8_t
+    else if constexpr (std::is_same_v<ElementA, uint8_t> && std::is_same_v<ElementB, uint8_t>) {
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90_64x256x32_S32U8U8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90_64x192x32_S32U8U8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90_64x128x32_S32U8U8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90_64x96x32_S32U8U8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90_64x64x32_S32U8U8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90_64x32x32_S32U8U8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90_64x16x32_S32U8U8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90_64x8x32_S32U8U8_SS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+  }
+
+  // Unknown accumulator type
+  else {
+    static_assert(sizeof(ElementC) == 0, "Unknown ElementC accumulator type.");
+  }
+}
+
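To make the dispatch above concrete, here is a hypothetical compile-time probe (the tile shape, `cute::Shape`/`Int` spellings, and include set are illustrative assumptions, not part of this patch): FP16 inputs with an FP32 accumulator and Tile_N = 128 fail the %256 and %192 tests and land on the 64x128x16 atom.

// Compile-time probe of ss_op_selector (illustrative sketch).
#include <cute/arch/mma_sm90.hpp>
#include <cute/layout.hpp>
#include <type_traits>

using TileShape = cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<64>>;  // (M, N, K)
using Atom = decltype(cute::GMMA::ss_op_selector<cute::half_t, cute::half_t,
                                                 float, TileShape>());
static_assert(std::is_same_v<Atom,
    cute::SM90_64x128x16_F32F16F16_SS<cute::GMMA::Major::K, cute::GMMA::Major::K>>);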
+template <
+  class ElementA,
+  class ElementB,
+  class ElementC,
+  class TileShape_MNK,
+  GMMA::Major MajorA = GMMA::Major::K,
+  GMMA::Major MajorB = GMMA::Major::K,
+  auto... Args   // e.g. GMMA::ScaleOut::One, [GMMA::ScaleIn::One, GMMA::ScaleIn::One]
+                 //      But most commonly leave empty for defaults
+>
+CUTE_HOST_DEVICE constexpr
+auto
+rs_op_selector()
+{
+  static_assert(is_static<TileShape_MNK>::value, "TileShape_MNK must be static.");
+  static_assert(rank(TileShape_MNK{}) == 3, "TileShape_MNK must be rank 3.");
+  static_assert(size<0>(TileShape_MNK{}) % 64 == 0, "Tile_M must be a multiple of 64.");
+  static_assert(MajorA == GMMA::Major::K, "Register source A operand GMMAs must have K-major A layout.");
+  auto Tile_N = size<1>(TileShape_MNK{});
+
+  // FP16 accumulator
+  if constexpr (std::is_same_v<ElementC, half_t>) {
+    static_assert(std::is_same_v<ElementA, half_t>, "Element types for AB must be half if ElementC is half.");
+    static_assert(std::is_same_v<ElementB, half_t>, "Element types for AB must be half if ElementC is half.");
+    static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16.");
+
+    // Dispatch against the Tile N mode size
+    if constexpr (Tile_N % 256 == 0) {
+      return SM90_64x256x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+    }
+    else if constexpr (Tile_N % 192 == 0) {
+      return SM90_64x192x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+    }
+    else if constexpr (Tile_N % 128 == 0) {
+      return SM90_64x128x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+    }
+    else if constexpr (Tile_N % 96 == 0) {
+      return SM90_64x96x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+    }
+    else if constexpr (Tile_N % 64 == 0) {
+      return SM90_64x64x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+    }
+    else if constexpr (Tile_N % 32 == 0) {
+      return SM90_64x32x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+    }
+    else if constexpr (Tile_N % 16 == 0) {
+      return SM90_64x16x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+    }
+    else if constexpr (Tile_N % 8 == 0) {
+      return SM90_64x8x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+    }
+    else {
+      static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+    }
+  }
+
+  // FP32 accumulator
+  else if constexpr (std::is_same_v<ElementC, float>) {
+    static_assert(std::is_same_v<ElementA, ElementB>, "ElementA and ElementB must be the same type for this config.");
+    static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16.");
+
+    // FP16 inputs
+    if constexpr (std::is_same_v<ElementA, half_t>) {
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90_64x256x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90_64x192x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90_64x128x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90_64x96x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90_64x64x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90_64x32x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90_64x16x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90_64x8x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // BF16 inputs
+    else if constexpr (std::is_same_v<ElementA, bfloat16_t>) {
+      static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90_64x256x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90_64x192x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90_64x128x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90_64x96x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90_64x64x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90_64x32x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90_64x16x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90_64x8x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // TF32 inputs
+    else if constexpr (std::is_same_v<ElementA, tfloat32_t>) {
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 8 == 0, "Tile_K must be a multiple of 8.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90_64x256x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90_64x192x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90_64x128x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90_64x96x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90_64x64x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90_64x32x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90_64x16x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90_64x8x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    else {
+      static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for requested configuration.");
+    }
+  }
+
+  // S32 accumulator
+  else if constexpr (std::is_same_v<ElementC, int32_t>) {
+    static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+    static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+    // ElementA == int8_t && ElementB == int8_t
+    if constexpr (std::is_same_v<ElementA, int8_t> && std::is_same_v<ElementB, int8_t>) {
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90_64x256x32_S32S8S8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90_64x192x32_S32S8S8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90_64x128x32_S32S8S8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90_64x96x32_S32S8S8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90_64x64x32_S32S8S8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90_64x32x32_S32S8S8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90_64x16x32_S32S8S8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90_64x8x32_S32S8S8_RS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // ElementA == int8_t && ElementB == uint8_t
+    else if constexpr (std::is_same_v<ElementA, int8_t> && std::is_same_v<ElementB, uint8_t>) {
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90_64x256x32_S32S8U8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90_64x192x32_S32S8U8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90_64x128x32_S32S8U8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90_64x96x32_S32S8U8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90_64x64x32_S32S8U8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90_64x32x32_S32S8U8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90_64x16x32_S32S8U8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90_64x8x32_S32S8U8_RS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // ElementA == uint8_t && ElementB == int8_t
+    else if constexpr (std::is_same_v<ElementA, uint8_t> && std::is_same_v<ElementB, int8_t>) {
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90_64x256x32_S32U8S8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90_64x192x32_S32U8S8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90_64x128x32_S32U8S8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90_64x96x32_S32U8S8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90_64x64x32_S32U8S8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90_64x32x32_S32U8S8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90_64x16x32_S32U8S8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90_64x8x32_S32U8S8_RS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // ElementA == uint8_t && ElementB == uint8_t
+    else if constexpr (std::is_same_v<ElementA, uint8_t> && std::is_same_v<ElementB, uint8_t>) {
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90_64x256x32_S32U8U8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90_64x192x32_S32U8U8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90_64x128x32_S32U8U8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90_64x96x32_S32U8U8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90_64x64x32_S32U8U8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90_64x32x32_S32U8U8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90_64x16x32_S32U8U8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90_64x8x32_S32U8U8_RS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+  }
+
+  // Unknown accumulator type
+  else {
+    static_assert(sizeof(ElementC) == 0, "Unknown ElementC accumulator type.");
+  }
+}
+
+} // end namespace GMMA
+} // end namespace cute
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
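The register-sourced selector mirrors the smem-sourced chain; the practical difference is the K-major requirement on A, since RS GMMAs read the A fragment from registers. A companion compile-time sketch under the same illustrative assumptions as the earlier probe:

// Companion probe of rs_op_selector (illustrative sketch): an FP16 accumulator
// with Tile_N = 64 selects the 64x64x16 register-sourced atom.
#include <cute/arch/mma_sm90.hpp>
#include <cute/layout.hpp>
#include <type_traits>

using TileShapeRS = cute::Shape<cute::Int<64>, cute::Int<64>, cute::Int<32>>;
using AtomRS = decltype(cute::GMMA::rs_op_selector<cute::half_t, cute::half_t,
                                                   cute::half_t, TileShapeRS>());
static_assert(std::is_same_v<AtomRS,
    cute::SM90_64x64x16_F16F16F16_RS<cute::GMMA::Major::K, cute::GMMA::Major::K>>);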
diff --git a/include/cute/arch/mma_sm90_desc.hpp b/include/cute/arch/mma_sm90_desc.hpp new file mode 100644 index 0000000000..abac517044 --- /dev/null +++ b/include/cute/arch/mma_sm90_desc.hpp @@ -0,0 +1,131 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/arch/mma.hpp>
+
+// Config
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && defined(__CUDA_ARCH_FEAT_SM90_ALL))
+#  define CUTE_ARCH_MMA_SM90_ENABLED
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cute {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// GMMA Descriptor and utilities
+
+// GMMA enums and utilities
+namespace GMMA
+{
+
+enum class LayoutType : uint8_t {
+  INTERLEAVE = 0,
+  B128 = 1,
+  B64 = 2,
+  B32 = 3,
+};
+
+CUTE_HOST_DEVICE char const* to_string(LayoutType const& t) {
+  switch (t) {
+    case LayoutType::INTERLEAVE: return "INTERLEAVE";
+    case LayoutType::B128:       return "B128";
+    case LayoutType::B64:        return "B64";
+    case LayoutType::B32:        return "B32";
+  }
+  return nullptr;
+}
+
+// Output operator for all enums in this namespace
+CUTE_HOST std::ostream& operator<<(std::ostream& os, LayoutType const& t) {
+  char const* s = to_string(t);
+  if (s) {
+    std::operator<<(os, s);  // Explicit call to avoid ambiguity
+  } else {
+    os.setstate(std::ios_base::failbit);
+  }
+  return os;
+}
+
+} // end namespace GMMA
+
+union GmmaDescriptor
+{
+  uint64_t desc_;
+  uint32_t reg32_[2];
+  uint16_t reg16_[4];
+
+  // Bitfield implementation avoids the need for shifts in assignment
+  struct {
+    // start_address, bit [0,14), 4LSB not included
+    uint16_t start_address_ : 14, : 2;        // 14 bits [0,14), 2 bits unused
+    // leading dimension byte offset, bit [16,30), 4LSB not included
+    // For N: This is the stride from the first col to the second col of the 8x2 brick in INTERLEAVED
+    //   Unused for all SWIZZLE_* layouts (and assumed to be 1)
+    // For T: This is the stride from the first 8 rows to the next 8 rows.
+    uint16_t leading_byte_offset_ : 14, : 2;  // 14 bits [0,14), 2 bits unused
+    // stride dimension byte offset, bit [32,46), 4LSB not included
+    // For N: This is the stride from the first 8 rows to the next 8 rows.
+    // For T: This is the stride from the first 8 cols to the next 8 cols.
+    uint16_t stride_byte_offset_ : 14, : 2;   // 14 bits [0,14), 2 bits unused
+    // base_offset, bit [49,52)
+    // Valid only for SWIZZLE_128B and SWIZZLE_64B
+    uint8_t : 1, base_offset_ : 3, : 4;       // 1 bit unused, 3 bits [1,4), 4 bits unused
+    // layout type, bit [62,64)
+    // SWIZZLE_NONE = 0, SWIZZLE_32B = 3, SWIZZLE_64B = 2, SWIZZLE_128B = 1
+    uint8_t : 6, layout_type_ : 2;            // 6 bits unused, 2 bits [6,8)
+  };
+
+  // Decay to a uint64_t
+  CUTE_HOST_DEVICE constexpr
+  operator uint64_t() const noexcept { return desc_; }
+
+  // Printer
+  CUTE_HOST_DEVICE friend void print(GmmaDescriptor const& t)
+  {
+    printf("GmmaDescriptor: 0x%016lx\n", t.desc_);
+    printf("  start_addr : 0x%04x\n", t.start_address_);
+    printf("  leading_off: 0x%04x (%d)\n", t.leading_byte_offset_, t.leading_byte_offset_);
+    printf("  stride_off : 0x%04x (%d)\n", t.stride_byte_offset_, t.stride_byte_offset_);
+    printf("  base_offset: 0x%01x\n", t.base_offset_);
+    printf("  layout_type: 0x%01x (%s)\n", t.layout_type_, to_string(static_cast<GMMA::LayoutType>(t.layout_type_)));
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cute
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
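Because the descriptor is a plain bitfield union, its encoding is easy to exercise on the host. A minimal sketch (the field values are made up purely for illustration; only this header is assumed):

// Host-side sketch: pack a GMMA descriptor field by field, then read back the
// raw 64-bit encoding through the decay-to-uint64_t conversion above.
#include <cute/arch/mma_sm90_desc.hpp>
#include <cstdint>
#include <cstdio>

int main() {
  cute::GmmaDescriptor desc{};                 // zero-initialize the union
  desc.start_address_       = 0x0040;          // smem address, 4 LSBs dropped
  desc.leading_byte_offset_ = 0x0080;          // leading-dim byte offset >> 4
  desc.stride_byte_offset_  = 0x0100;          // stride-dim byte offset >> 4
  desc.layout_type_         = uint8_t(cute::GMMA::LayoutType::B128);
  std::printf("raw descriptor: 0x%016llx\n",
              (unsigned long long)uint64_t(desc));
  return 0;
}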
diff --git a/include/cute/arch/mma_sm90_gmma.hpp b/include/cute/arch/mma_sm90_gmma.hpp new file mode 100644 index 0000000000..25a1d1714a --- /dev/null +++ b/include/cute/arch/mma_sm90_gmma.hpp @@ -0,0 +1,12265 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+#include <cute/arch/mma.hpp>
+
+// Config
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && defined(__CUDA_ARCH_FEAT_SM90_ALL))
+#  define CUTE_ARCH_MMA_SM90_ENABLED
+#endif
+
+namespace cute {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Warpgroup sync primitives
+
+CUTE_HOST_DEVICE
+void
+warpgroup_arrive()
+{
+#if defined(CUTE_ARCH_MMA_SM90_ENABLED)
+  asm volatile ("wgmma.fence.sync.aligned;\n" ::: "memory");
+#else
+  CUTE_RUNTIME_ASSERT("Attempting to use wgmma.fence without CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+}
+
+template <int N>
+CUTE_HOST_DEVICE
+void
+warpgroup_wait()
+{
+  static_assert(N >= 0 && N <= 7, "warpgroup_wait<N>: N must be in range [0, 7]");
+#if defined(CUTE_ARCH_MMA_SM90_ENABLED)
+  asm volatile("wgmma.wait_group.sync.aligned %0;\n" :: "n"(N) : "memory");
+#else
+  CUTE_RUNTIME_ASSERT("Attempting to use wgmma.wait_group without CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+}
+
+// Marks the commit point for one or more batches of warpgroup MMAs.
+CUTE_HOST_DEVICE
+void
+warpgroup_commit_batch()
+{
+#if defined(CUTE_ARCH_MMA_SM90_ENABLED)
+  asm volatile("wgmma.commit_group.sync.aligned;\n" ::: "memory");
+#else
+  CUTE_RUNTIME_ASSERT("Attempting to use wgmma.commit_group without CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+}
+
+CUTE_HOST_DEVICE
+void
+warpgroup_fence_operand(uint32_t& reg) {
+  asm volatile("" : "+r"(reg) :: "memory");
+}
+
+CUTE_HOST_DEVICE
+void
+warpgroup_fence_operand(float& reg) {
+  asm volatile("" : "+f"(reg) :: "memory");
+}
+
+namespace GMMA {
+
+enum class Major {
+  K  = 0,
+  MN = 1
+};
+
+enum class ScaleOut {
+  Zero = 0,
+  One  = 1
+};
+
+enum class ScaleIn {
+  Neg = -1,
+  One =  1
+};
+
+} // namespace GMMA
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// GMMA PTX definitions: C = (scaleA * A) * (scaleB * B) + (scaleD * C)
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x16 F16+=F16*F16
+template<
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleOut scaleD = GMMA::ScaleOut::One,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+struct SM90_64x8x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[2];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t & d0, uint32_t & d1)
+  {
+#if defined(CUTE_ARCH_MMA_SM90_ENABLED)
+    asm volatile(
+      "wgmma.mma_async.sync.aligned.m64n8k16.f16.f16.f16 "
+      "{%0, %1},"
+      " %2,"
+      " %3,"
+      " %4, %5, %6, %7, %8;\n"
+      : "+r"(d0), "+r"(d1)
+      : "l"(desc_a),
+        "l"(desc_b),
+        "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x8x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
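The warpgroup primitives above define the synchronization contract around every wgmma batch: fence the operands, issue the async MMAs, commit the batch, and wait before touching the accumulators. A hedged device-side sketch of that sequence (descriptor construction is elided; desc_a/desc_b stand in for valid GMMA descriptors built elsewhere):

// Device-side sketch of one committed batch of warpgroup MMAs.
__device__ void gmma_batch_sketch(uint64_t desc_a, uint64_t desc_b,
                                  uint32_t& d0, uint32_t& d1)
{
  using Op = cute::SM90_64x8x16_F16F16F16_SS<cute::GMMA::Major::K,
                                             cute::GMMA::Major::K>;
  cute::warpgroup_arrive();            // fence register/smem operands
  Op::fma(desc_a, desc_b, d0, d1);     // async MMA issued by the warpgroup
  cute::warpgroup_commit_batch();      // close this batch of MMAs
  cute::warpgroup_wait<0>();           // block until all committed batches finish
}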
+// GMMA 64x8x16 F16+=F16*F16
+template<
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleOut scaleD = GMMA::ScaleOut::One,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+struct SM90_64x8x16_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[2];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t & d0, uint32_t & d1)
+  {
+#if defined(CUTE_ARCH_MMA_SM90_ENABLED)
+    asm volatile(
+      "wgmma.mma_async.sync.aligned.m64n8k16.f16.f16.f16 "
+      "{%0, %1},"
+      "{%2, %3, %4, %5},"
+      " %6,"
+      " %7, %8, %9, %10;\n"
+      : "+r"(d0), "+r"(d1)
+      : "r"(a0), "r"(a1), "r"(a2), "r"(a3),
+        "l"(desc_b),
+        "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x8x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x16 F16+=F16*F16
+template<
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleOut scaleD = GMMA::ScaleOut::One,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+struct SM90_64x16x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[4];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3)
+  {
+#if defined(CUTE_ARCH_MMA_SM90_ENABLED)
+    asm volatile(
+      "wgmma.mma_async.sync.aligned.m64n16k16.f16.f16.f16 "
+      "{%0, %1, %2, %3},"
+      " %4,"
+      " %5,"
+      " %6, %7, %8, %9, %10;\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
+      : "l"(desc_a),
+        "l"(desc_b),
+        "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x16x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x16 F16+=F16*F16
+template<
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleOut scaleD = GMMA::ScaleOut::One,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+struct SM90_64x16x16_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[4];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3)
+  {
+#if defined(CUTE_ARCH_MMA_SM90_ENABLED)
+    asm volatile(
+      "wgmma.mma_async.sync.aligned.m64n16k16.f16.f16.f16 "
+      "{%0, %1, %2, %3},"
+      "{%4, %5, %6, %7},"
+      " %8,"
+      " %9, %10, %11, %12;\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
+      : "r"(a0), "r"(a1), "r"(a2), "r"(a3),
+        "l"(desc_b),
+        "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x16x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x16 F16+=F16*F16
+template<
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleOut scaleD = GMMA::ScaleOut::One,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x32x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10, %11, %12, %13, %14;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x32x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x16 F16+=F16*F16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x32x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13, %14, %15, %16;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x32x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x16 F16+=F16*F16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x64x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18, %19, %20, %21, %22;\n" + : "+r"(d00), "+r"(d01), 
"+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x64x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x16 F16+=F16*F16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x64x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21, %22, %23, %24;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x64x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x16 F16+=F16*F16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x96x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n96k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + " %24," + " %25," + " %26, %27, %28, %29, %30;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), 
"+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x96x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x16 F16+=F16*F16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x96x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[24]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n96k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23}," + "{%24, %25, %26, %27}," + " %28," + " %29, %30, %31, %32;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x96x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x16 F16+=F16*F16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x128x16_F16F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, 
uint32_t & d31) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34, %35, %36, %37, %38;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x128x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x16 F16+=F16*F16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x128x16_F16F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k16.f16.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37, %38, %39, %40;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x128x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x16 F16+=F16*F16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = 
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+struct SM90_64x192x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[48];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03,
+      uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07,
+      uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11,
+      uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15,
+      uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19,
+      uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23,
+      uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27,
+      uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31,
+      uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35,
+      uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39,
+      uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43,
+      uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47)
+  {
+#if defined(CUTE_ARCH_MMA_SM90_ENABLED)
+    asm volatile(
+    "wgmma.mma_async.sync.aligned.m64n192k16.f16.f16.f16 "
+    "{%0, %1, %2, %3, %4, %5, %6, %7, "
+    " %8, %9, %10, %11, %12, %13, %14, %15, "
+    " %16, %17, %18, %19, %20, %21, %22, %23, "
+    " %24, %25, %26, %27, %28, %29, %30, %31, "
+    " %32, %33, %34, %35, %36, %37, %38, %39, "
+    " %40, %41, %42, %43, %44, %45, %46, %47},"
+    " %48,"
+    " %49,"
+    " %50, %51, %52, %53, %54;\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      : "l"(desc_a),
+        "l"(desc_b),
+        "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x192x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x16 F16+=F16*F16
+template<
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleOut scaleD = GMMA::ScaleOut::One,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+struct SM90_64x192x16_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[48];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03,
+      uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07,
+      uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11,
+      uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15,
+      uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19,
+      uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23,
+      uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27,
+      uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31,
+      uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35,
+      uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39,
+      uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43,
+      uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47)
+  {
+#if defined(CUTE_ARCH_MMA_SM90_ENABLED)
+    asm volatile(
+    "wgmma.mma_async.sync.aligned.m64n192k16.f16.f16.f16 "
+    "{%0, %1, %2, %3, %4, %5, %6, %7, "
+    " %8, %9, %10, %11, %12, %13, %14, %15, "
+    " %16, %17, %18, %19, %20, %21, %22, %23, "
+    " %24, %25, %26, %27, %28, %29, %30, %31, "
+    " %32, %33, %34, %35, %36, %37, %38, %39, "
+    " %40, %41, %42, %43, %44, %45, %46, %47},"
+    "{%48, %49, %50, %51},"
+    " %52,"
+    " %53, %54, %55, %56;\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      : "r"(a00), "r"(a01), "r"(a02), "r"(a03),
+        "l"(desc_b),
+        "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x192x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x16 F16+=F16*F16
+template<
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleOut scaleD = GMMA::ScaleOut::One,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+struct SM90_64x256x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[64];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03,
+      uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07,
+      uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11,
+      uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15,
+      uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19,
+      uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23,
+      uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27,
+      uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31,
+      uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35,
+      uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39,
+      uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43,
+      uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47,
+      uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51,
+      uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55,
+      uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59,
+      uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63)
+  {
+#if defined(CUTE_ARCH_MMA_SM90_ENABLED)
+    asm volatile(
+    "wgmma.mma_async.sync.aligned.m64n256k16.f16.f16.f16 "
+    "{%0, %1, %2, %3, %4, %5, %6, %7, "
+    " %8, %9, %10, %11, %12, %13, %14, %15, "
+    " %16, %17, %18, %19, %20, %21, %22, %23, "
+    " %24, %25, %26, %27, %28, %29, %30, %31, "
+    " %32, %33, %34, %35, %36, %37, %38, %39, "
+    " %40, %41, %42, %43, %44, %45, %46, %47, "
+    " %48, %49, %50, %51, %52, %53, %54, %55, "
+    " %56, %57, %58, %59, %60, %61, %62, %63},"
+    " %64,"
+    " %65,"
+    " %66, %67, %68, %69, %70;\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      : "l"(desc_a),
+        "l"(desc_b),
+        "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x256x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x16 F16+=F16*F16
+template<
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleOut scaleD = GMMA::ScaleOut::One,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+struct SM90_64x256x16_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[64];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03,
+      uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07,
+      uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11,
+      uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15,
+      uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19,
+      uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23,
+      uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27,
+      uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31,
+      uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35,
+      uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39,
+      uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43,
+      uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47,
+      uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51,
+      uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55,
+      uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59,
+      uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63)
+  {
+#if defined(CUTE_ARCH_MMA_SM90_ENABLED)
+    asm volatile(
+    "wgmma.mma_async.sync.aligned.m64n256k16.f16.f16.f16 "
+    "{%0, %1, %2, %3, %4, %5, %6, %7, "
+    " %8, %9, %10, %11, %12, %13, %14, %15, "
+    " %16, %17, %18, %19, %20, %21, %22, %23, "
+    " %24, %25, %26, %27, %28, %29, %30, %31, "
+    " %32, %33, %34, %35, %36, %37, %38, %39, "
+    " %40, %41, %42, %43, %44, %45, %46, %47, "
+    " %48, %49, %50, %51, %52, %53, %54, %55, "
+    " %56, %57, %58, %59, %60, %61, %62, %63},"
+ "{%64, %65, %66, %67}," + " %68," + " %69, %70, %71, %72;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x256x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x16 F32+=F16*F16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x8x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k16.f32.f16.f16 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6, %7, %8, %9, %10;\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x8x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x16 F32+=F16*F16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x8x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k16.f32.f16.f16 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9, %10, %11, %12;\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x8x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x16 F32+=F16*F16
+template<
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleOut scaleD = GMMA::ScaleOut::One,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+struct SM90_64x16x16_F32F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[8];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float & d0, float & d1, float & d2, float & d3,
+      float & d4, float & d5, float & d6, float & d7)
+  {
+#if defined(CUTE_ARCH_MMA_SM90_ENABLED)
+    asm volatile(
+    "wgmma.mma_async.sync.aligned.m64n16k16.f32.f16.f16 "
+    "{%0, %1, %2, %3, %4, %5, %6, %7},"
+    " %8,"
+    " %9,"
+    " %10, %11, %12, %13, %14;\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      : "l"(desc_a),
+        "l"(desc_b),
+        "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x16x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x16 F32+=F16*F16
+template<
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleOut scaleD = GMMA::ScaleOut::One,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+struct SM90_64x16x16_F32F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[8];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      float & d0, float & d1, float & d2, float & d3,
+      float & d4, float & d5, float & d6, float & d7)
+  {
+#if defined(CUTE_ARCH_MMA_SM90_ENABLED)
+    asm volatile(
+    "wgmma.mma_async.sync.aligned.m64n16k16.f32.f16.f16 "
+    "{%0, %1, %2, %3, %4, %5, %6, %7},"
+    "{%8, %9, %10, %11},"
+    " %12,"
+    " %13, %14, %15, %16;\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      : "r"(a0), "r"(a1), "r"(a2), "r"(a3),
+        "l"(desc_b),
+        "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x16x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x16 F32+=F16*F16
+template<
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleOut scaleD = GMMA::ScaleOut::One,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+struct SM90_64x32x16_F32F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[16];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float & d00, float & d01, float & d02, float & d03,
+      float & d04, float & d05, float & d06, float & d07,
+      float & d08, float & d09, float & d10, float & d11,
+      float & d12, float & d13, float & d14, float & d15)
+  {
+#if defined(CUTE_ARCH_MMA_SM90_ENABLED)
+    asm volatile(
"wgmma.mma_async.sync.aligned.m64n32k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18, %19, %20, %21, %22;\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x32x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x16 F32+=F16*F16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x32x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21, %22, %23, %24;\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x32x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x16 F32+=F16*F16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x64x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, 
+    " %16, %17, %18, %19, %20, %21, %22, %23, "
+    " %24, %25, %26, %27, %28, %29, %30, %31},"
+    " %32,"
+    " %33,"
+    " %34, %35, %36, %37, %38;\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      : "l"(desc_a),
+        "l"(desc_b),
+        "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x64x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x16 F32+=F16*F16
+template<
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleOut scaleD = GMMA::ScaleOut::One,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+struct SM90_64x64x16_F32F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[32];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float & d00, float & d01, float & d02, float & d03,
+      float & d04, float & d05, float & d06, float & d07,
+      float & d08, float & d09, float & d10, float & d11,
+      float & d12, float & d13, float & d14, float & d15,
+      float & d16, float & d17, float & d18, float & d19,
+      float & d20, float & d21, float & d22, float & d23,
+      float & d24, float & d25, float & d26, float & d27,
+      float & d28, float & d29, float & d30, float & d31)
+  {
+#if defined(CUTE_ARCH_MMA_SM90_ENABLED)
+    asm volatile(
+    "wgmma.mma_async.sync.aligned.m64n64k16.f32.f16.f16 "
+    "{%0, %1, %2, %3, %4, %5, %6, %7, "
+    " %8, %9, %10, %11, %12, %13, %14, %15, "
+    " %16, %17, %18, %19, %20, %21, %22, %23, "
+    " %24, %25, %26, %27, %28, %29, %30, %31},"
+    "{%32, %33, %34, %35},"
+    " %36,"
+    " %37, %38, %39, %40;\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      : "r"(a00), "r"(a01), "r"(a02), "r"(a03),
+        "l"(desc_b),
+        "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x64x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x16 F32+=F16*F16
+template<
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleOut scaleD = GMMA::ScaleOut::One,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+struct SM90_64x96x16_F32F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[48];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float & d00, float & d01, float & d02, float & d03,
+      float & d04, float & d05, float & d06, float & d07,
+      float & d08, float & d09, float & d10, float & d11,
+      float & d12, float & d13, float & d14, float & d15,
+      float & d16, float & d17, float & d18, float & d19,
+      float & d20, float & d21, float & d22, float & d23,
+      float & d24, float & d25, float & d26, float & d27,
+      float & d28, float & d29, float & d30, float & d31,
+      float & d32, float & d33, float & d34, float & d35,
+      float & d36, float & d37, float & d38, float & d39,
+      float & d40, float & d41, float & d42, float & d43,
+      float & d44, float & d45, float & d46, float & d47)
+  {
+#if defined(CUTE_ARCH_MMA_SM90_ENABLED)
+    asm volatile(
+    "wgmma.mma_async.sync.aligned.m64n96k16.f32.f16.f16 "
+    "{%0, %1, %2, %3, %4, %5, %6, %7, "
+    " %8, %9, %10, %11, %12, %13, %14, %15, "
+    " %16, %17, %18, %19, %20, %21, %22, %23, "
+    " %24, %25, %26, %27, %28, %29, %30, %31, "
+    " %32, %33, %34, %35, %36, %37, %38, %39, "
+    " %40, %41, %42, %43, %44, %45, %46, %47},"
+    " %48,"
+    " %49,"
+    " %50, %51, %52, %53, %54;\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      : "l"(desc_a),
+        "l"(desc_b),
+        "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x96x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x16 F32+=F16*F16
+template<
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleOut scaleD = GMMA::ScaleOut::One,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+struct SM90_64x96x16_F32F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[48];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float & d00, float & d01, float & d02, float & d03,
+      float & d04, float & d05, float & d06, float & d07,
+      float & d08, float & d09, float & d10, float & d11,
+      float & d12, float & d13, float & d14, float & d15,
+      float & d16, float & d17, float & d18, float & d19,
+      float & d20, float & d21, float & d22, float & d23,
+      float & d24, float & d25, float & d26, float & d27,
+      float & d28, float & d29, float & d30, float & d31,
+      float & d32, float & d33, float & d34, float & d35,
+      float & d36, float & d37, float & d38, float & d39,
+      float & d40, float & d41, float & d42, float & d43,
+      float & d44, float & d45, float & d46, float & d47)
+  {
+#if defined(CUTE_ARCH_MMA_SM90_ENABLED)
+    asm volatile(
+    "wgmma.mma_async.sync.aligned.m64n96k16.f32.f16.f16 "
+    "{%0, %1, %2, %3, %4, %5, %6, %7, "
+    " %8, %9, %10, %11, %12, %13, %14, %15, "
+    " %16, %17, %18, %19, %20, %21, %22, %23, "
+    " %24, %25, %26, %27, %28, %29, %30, %31, "
+    " %32, %33, %34, %35, %36, %37, %38, %39, "
+    " %40, %41, %42, %43, %44, %45, %46, %47},"
+    "{%48, %49, %50, %51},"
+    " %52,"
+    " %53, %54, %55, %56;\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      : "r"(a00), "r"(a01), "r"(a02), "r"(a03),
+        "l"(desc_b),
+        "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x96x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x16 F32+=F16*F16
+template<
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleOut scaleD = GMMA::ScaleOut::One,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+struct SM90_64x128x16_F32F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[64];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float & d00, float & d01, float & d02, float & d03,
+      float & d04, float & d05, float & d06, float & d07,
+      float & d08, float & d09, float & d10, float & d11,
+      float & d12, float & d13, float & d14, float & d15,
+      float & d16, float & d17, float & d18, float & d19,
+      float & d20, float & d21, float & d22, float & d23,
+      float & d24, float & d25, float & d26, float & d27,
+      float & d28, float & d29, float & d30, float & d31,
+      float & d32, float & d33, float & d34, float & d35,
+      float & d36, float & d37, float & d38, float & d39,
+      float & d40, float & d41, float & d42, float & d43,
+      float & d44, float & d45, float & d46, float & d47,
+      float & d48, float & d49, float & d50, float & d51,
+      float & d52, float & d53, float & d54, float & d55,
+      float & d56, float & d57, float & d58, float & d59,
+      float & d60, float & d61, float & d62, float & d63)
+  {
+#if defined(CUTE_ARCH_MMA_SM90_ENABLED)
+    asm volatile(
+    "wgmma.mma_async.sync.aligned.m64n128k16.f32.f16.f16 "
+    "{%0, %1, %2, %3, %4, %5, %6, %7, "
+    " %8, %9, %10, %11, %12, %13, %14, %15, "
+    " %16, %17, %18, %19, %20, %21, %22, %23, "
+    " %24, %25, %26, %27, %28, %29, %30, %31, "
+    " %32, %33, %34, %35, %36, %37, %38, %39, "
+    " %40, %41, %42, %43, %44, %45, %46, %47, "
+    " %48, %49, %50, %51, %52, %53, %54, %55, "
+    " %56, %57, %58, %59, %60, %61, %62, %63},"
+    " %64,"
+    " %65,"
+    " %66, %67, %68, %69, %70;\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
"+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x128x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x16 F32+=F16*F16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x128x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69, %70, %71, %72;\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x128x16_F32F16F16_RS 
+                        "without CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x16 F32+=F16*F16
+template<
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleOut scaleD = GMMA::ScaleOut::One,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+struct SM90_64x192x16_F32F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[96];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float & d00, float & d01, float & d02, float & d03,
+      float & d04, float & d05, float & d06, float & d07,
+      float & d08, float & d09, float & d10, float & d11,
+      float & d12, float & d13, float & d14, float & d15,
+      float & d16, float & d17, float & d18, float & d19,
+      float & d20, float & d21, float & d22, float & d23,
+      float & d24, float & d25, float & d26, float & d27,
+      float & d28, float & d29, float & d30, float & d31,
+      float & d32, float & d33, float & d34, float & d35,
+      float & d36, float & d37, float & d38, float & d39,
+      float & d40, float & d41, float & d42, float & d43,
+      float & d44, float & d45, float & d46, float & d47,
+      float & d48, float & d49, float & d50, float & d51,
+      float & d52, float & d53, float & d54, float & d55,
+      float & d56, float & d57, float & d58, float & d59,
+      float & d60, float & d61, float & d62, float & d63,
+      float & d64, float & d65, float & d66, float & d67,
+      float & d68, float & d69, float & d70, float & d71,
+      float & d72, float & d73, float & d74, float & d75,
+      float & d76, float & d77, float & d78, float & d79,
+      float & d80, float & d81, float & d82, float & d83,
+      float & d84, float & d85, float & d86, float & d87,
+      float & d88, float & d89, float & d90, float & d91,
+      float & d92, float & d93, float & d94, float & d95)
+  {
+#if defined(CUTE_ARCH_MMA_SM90_ENABLED)
+    asm volatile(
+    "wgmma.mma_async.sync.aligned.m64n192k16.f32.f16.f16 "
+    "{%0, %1, %2, %3, %4, %5, %6, %7, "
+    " %8, %9, %10, %11, %12, %13, %14, %15, "
+    " %16, %17, %18, %19, %20, %21, %22, %23, "
+    " %24, %25, %26, %27, %28, %29, %30, %31, "
+    " %32, %33, %34, %35, %36, %37, %38, %39, "
+    " %40, %41, %42, %43, %44, %45, %46, %47, "
+    " %48, %49, %50, %51, %52, %53, %54, %55, "
+    " %56, %57, %58, %59, %60, %61, %62, %63, "
+    " %64, %65, %66, %67, %68, %69, %70, %71, "
+    " %72, %73, %74, %75, %76, %77, %78, %79, "
+    " %80, %81, %82, %83, %84, %85, %86, %87, "
+    " %88, %89, %90, %91, %92, %93, %94, %95},"
+    " %96,"
+    " %97,"
+    " %98, %99, %100, %101, %102;\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
"+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x192x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x16 F32+=F16*F16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x192x16_F32F16F16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k16.f32.f16.f16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " %101, %102, %103, %104;\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), 
"+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x192x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x16 F32+=F16*F16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x256x16_F32F16F16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127) + { +#if 
+#if defined(CUTE_ARCH_MMA_SM90_ENABLED)
+    asm volatile(
+    "wgmma.mma_async.sync.aligned.m64n256k16.f32.f16.f16 "
+    "{%0, %1, %2, %3, %4, %5, %6, %7, "
+    " %8, %9, %10, %11, %12, %13, %14, %15, "
+    " %16, %17, %18, %19, %20, %21, %22, %23, "
+    " %24, %25, %26, %27, %28, %29, %30, %31, "
+    " %32, %33, %34, %35, %36, %37, %38, %39, "
+    " %40, %41, %42, %43, %44, %45, %46, %47, "
+    " %48, %49, %50, %51, %52, %53, %54, %55, "
+    " %56, %57, %58, %59, %60, %61, %62, %63, "
+    " %64, %65, %66, %67, %68, %69, %70, %71, "
+    " %72, %73, %74, %75, %76, %77, %78, %79, "
+    " %80, %81, %82, %83, %84, %85, %86, %87, "
+    " %88, %89, %90, %91, %92, %93, %94, %95, "
+    " %96, %97, %98, %99, %100, %101, %102, %103, "
+    " %104, %105, %106, %107, %108, %109, %110, %111, "
+    " %112, %113, %114, %115, %116, %117, %118, %119, "
+    " %120, %121, %122, %123, %124, %125, %126, %127},"
+    " %128,"
+    " %129,"
+    " %130, %131, %132, %133, %134;\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
+      : "l"(desc_a),
+        "l"(desc_b),
+        "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x256x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x16 F32+=F16*F16
+template<
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleOut scaleD = GMMA::ScaleOut::One,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+struct SM90_64x256x16_F32F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[128];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float & d000, float & d001, float & d002, float & d003,
+      float & d004, float & d005, float & d006, float & d007,
+      float & d008, float & d009, float & d010, float & d011,
+      float & d012, float & d013, float & d014, float & d015,
+      float & d016, float & d017, float & d018, float & d019,
+      float & d020, float & d021, float & d022, float & d023,
+      float & d024, float & d025, float & d026, float & d027,
+      float & d028, float & d029, float & d030, float & d031,
+      float & d032, float & d033, float & d034, float & d035,
+      float & d036, float & d037, float & d038, float & d039,
+      float & d040, float & d041, float & d042, float & d043,
+      float & d044, float & d045, float & d046, float & d047,
+      float & d048, float & d049, float & d050, float & d051,
+      float & d052, float & d053, float & d054, float & d055,
+      float & d056, float & d057, float & d058, float & d059,
+      float & d060, float & d061, float & d062, float & d063,
+      float & d064, float & d065, float & d066, float & d067,
+      float & d068, float & d069, float & d070, float & d071,
+      float & d072, float & d073, float & d074, float & d075,
+      float & d076, float & d077, float & d078, float & d079,
+      float & d080, float & d081, float & d082, float & d083,
+      float & d084, float & d085, float & d086, float & d087,
+      float & d088, float & d089, float & d090, float & d091,
+      float & d092, float & d093, float & d094, float & d095,
+      float & d096, float & d097, float & d098, float & d099,
+      float & d100, float & d101, float & d102, float & d103,
+      float & d104, float & d105, float & d106, float & d107,
+      float & d108, float & d109, float & d110, float & d111,
+      float & d112, float & d113, float & d114, float & d115,
+      float & d116, float & d117, float & d118, float & d119,
+      float & d120, float & d121, float & d122, float & d123,
+      float & d124, float & d125, float & d126, float & d127)
+  {
+#if defined(CUTE_ARCH_MMA_SM90_ENABLED)
+    asm volatile(
+    "wgmma.mma_async.sync.aligned.m64n256k16.f32.f16.f16 "
+    "{%0, %1, %2, %3, %4, %5, %6, %7, "
+    " %8, %9, %10, %11, %12, %13, %14, %15, "
+    " %16, %17, %18, %19, %20, %21, %22, %23, "
+    " %24, %25, %26, %27, %28, %29, %30, %31, "
+    " %32, %33, %34, %35, %36, %37, %38, %39, "
+    " %40, %41, %42, %43, %44, %45, %46, %47, "
+    " %48, %49, %50, %51, %52, %53, %54, %55, "
+    " %56, %57, %58, %59, %60, %61, %62, %63, "
+    " %64, %65, %66, %67, %68, %69, %70, %71, "
+    " %72, %73, %74, %75, %76, %77, %78, %79, "
+    " %80, %81, %82, %83, %84, %85, %86, %87, "
+    " %88, %89, %90, %91, %92, %93, %94, %95, "
+    " %96, %97, %98, %99, %100, %101, %102, %103, "
+    " %104, %105, %106, %107, %108, %109, %110, %111, "
+    " %112, %113, %114, %115, %116, %117, %118, %119, "
+    " %120, %121, %122, %123, %124, %125, %126, %127},"
+    "{%128, %129, %130, %131},"
+    " %132,"
+    " %133, %134, %135, %136;\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
"+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x256x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x16 F32+=BF16*BF16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x8x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k16.f32.bf16.bf16 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6, %7, %8, %9, %10;\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x8x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x16 F32+=BF16*BF16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x8x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k16.f32.bf16.bf16 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9, %10, %11, %12;\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x8x16_F32BF16BF16_RS without 
+                        "CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x16 F32+=BF16*BF16
+template<
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleOut scaleD = GMMA::ScaleOut::One,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+struct SM90_64x16x16_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[8];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float & d0, float & d1, float & d2, float & d3,
+      float & d4, float & d5, float & d6, float & d7)
+  {
+#if defined(CUTE_ARCH_MMA_SM90_ENABLED)
+    asm volatile(
+    "wgmma.mma_async.sync.aligned.m64n16k16.f32.bf16.bf16 "
+    "{%0, %1, %2, %3, %4, %5, %6, %7},"
+    " %8,"
+    " %9,"
+    " %10, %11, %12, %13, %14;\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      : "l"(desc_a),
+        "l"(desc_b),
+        "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x16x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x16 F32+=BF16*BF16
+template<
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleOut scaleD = GMMA::ScaleOut::One,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+struct SM90_64x16x16_F32BF16BF16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[8];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      float & d0, float & d1, float & d2, float & d3,
+      float & d4, float & d5, float & d6, float & d7)
+  {
+#if defined(CUTE_ARCH_MMA_SM90_ENABLED)
+    asm volatile(
+    "wgmma.mma_async.sync.aligned.m64n16k16.f32.bf16.bf16 "
+    "{%0, %1, %2, %3, %4, %5, %6, %7},"
+    "{%8, %9, %10, %11},"
+    " %12,"
+    " %13, %14, %15, %16;\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      : "r"(a0), "r"(a1), "r"(a2), "r"(a3),
+        "l"(desc_b),
+        "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x16x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x16 F32+=BF16*BF16
+template<
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleOut scaleD = GMMA::ScaleOut::One,
+  GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn scaleB = GMMA::ScaleIn::One
+>
+struct SM90_64x32x16_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[16];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float & d00, float & d01, float & d02, float & d03,
+      float & d04, float & d05, float & d06, float & d07,
+      float & d08, float & d09, float & d10, float & d11,
+      float & d12, float & d13, float & d14, float & d15)
+  {
defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18, %19, %20, %21, %22;\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x32x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x16 F32+=BF16*BF16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x32x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21, %22, %23, %24;\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x32x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x16 F32+=BF16*BF16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x64x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, 
%4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34, %35, %36, %37, %38;\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x64x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x16 F32+=BF16*BF16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x64x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37, %38, %39, %40;\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x64x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x16 F32+=BF16*BF16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x96x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + CUTE_HOST_DEVICE 
static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n96k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50, %51, %52, %53, %54;\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x96x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x16 F32+=BF16*BF16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x96x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n96k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, 
%12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53, %54, %55, %56;\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x96x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x16 F32+=BF16*BF16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x128x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66, %67, %68, %69, %70;\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + 
"+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x128x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x16 F32+=BF16*BF16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x128x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69, %70, %71, %72;\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), 
"n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x128x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x16 F32+=BF16*BF16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x192x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " %98, %99, %100, %101, %102;\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), 
"+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x192x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x16 F32+=BF16*BF16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x192x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " %101, %102, %103, %104;\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), 
"+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x192x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x16 F32+=BF16*BF16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x256x16_F32BF16BF16_SS +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + 
float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " %130, %131, %132, %133, %134;\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x256x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x16 F32+=BF16*BF16 +template< + GMMA::Major tnspA, + GMMA::Major tnspB, + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x256x16_F32BF16BF16_RS +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + static_assert(tnspA == GMMA::Major::K, + "Register source operand A must have K major layout."); + + CUTE_HOST_DEVICE static 
void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k16.f32.bf16.bf16 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " %133, %134, %135, %136;\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), 
"+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x256x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x8 TN F32+=TF32*TF32 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x8x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k8.f32.tf32.tf32 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6, %7, %8;\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x8x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x8x8 TN F32+=TF32*TF32 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x8x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k8.f32.tf32.tf32 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9, %10, %11;\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x8x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + 
+// GMMA 64x16x8 TN F32+=TF32*TF32 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x16x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n16k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10, %11, %12;\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x16x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x16x8 TN F32+=TF32*TF32 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x16x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + float & d0, float & d1, float & d2, float & d3, + float & d4, float & d5, float & d6, float & d7) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n16k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13, %14, %15;\n" + : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3), + "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x16x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x8 TN F32+=TF32*TF32 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x32x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18, %19, %20;\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "l"(desc_a), + "l"(desc_b), + 
"n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x32x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x32x8 TN F32+=TF32*TF32 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x32x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21, %22, %23;\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x32x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x8 TN F32+=TF32*TF32 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x64x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34, %35, %36;\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_RUNTIME_ASSERT("Attempting 
to use SM90_64x64x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x64x8 TN F32+=TF32*TF32 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x64x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37, %38, %39;\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x64x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x8 TN F32+=TF32*TF32 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x96x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n96k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, 
" + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50, %51, %52;\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x96x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x96x8 TN F32+=TF32*TF32 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x96x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n96k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53, %54, %55;\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x96x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x8 TN F32+=TF32*TF32 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x128x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66, %67, %68;\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x128x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x128x8 TN F32+=TF32*TF32 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x128x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & 
d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69, %70, %71;\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x128x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x8 TN F32+=TF32*TF32 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x192x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float 
& d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " %98, %99, %100;\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x192x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x192x8 TN F32+=TF32*TF32 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x192x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + float & d00, float & d01, float & d02, float & d03, + float & d04, float & d05, float & d06, float & d07, + float & d08, float & d09, float & d10, float & d11, + float & d12, float & d13, float & d14, float & d15, + float & d16, float & d17, 
float & d18, float & d19, + float & d20, float & d21, float & d22, float & d23, + float & d24, float & d25, float & d26, float & d27, + float & d28, float & d29, float & d30, float & d31, + float & d32, float & d33, float & d34, float & d35, + float & d36, float & d37, float & d38, float & d39, + float & d40, float & d41, float & d42, float & d43, + float & d44, float & d45, float & d46, float & d47, + float & d48, float & d49, float & d50, float & d51, + float & d52, float & d53, float & d54, float & d55, + float & d56, float & d57, float & d58, float & d59, + float & d60, float & d61, float & d62, float & d63, + float & d64, float & d65, float & d66, float & d67, + float & d68, float & d69, float & d70, float & d71, + float & d72, float & d73, float & d74, float & d75, + float & d76, float & d77, float & d78, float & d79, + float & d80, float & d81, float & d82, float & d83, + float & d84, float & d85, float & d86, float & d87, + float & d88, float & d89, float & d90, float & d91, + float & d92, float & d93, float & d94, float & d95) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " %101, %102, %103;\n" + : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), + "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07), + "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11), + "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15), + "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19), + "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23), + "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27), + "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31), + "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35), + "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39), + "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43), + "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47), + "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51), + "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55), + "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59), + "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63), + "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67), + "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71), + "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75), + "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79), + "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83), + "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87), + "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91), + "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x192x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x8 TN F32+=TF32*TF32 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x256x8_F32TF32TF32_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using 
BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " %130, %131, %132;\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), 
"+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x256x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA 64x256x8 TN F32+=TF32*TF32 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One, + GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, + GMMA::ScaleIn scaleB = GMMA::ScaleIn::One +> +struct SM90_64x256x8_F32TF32TF32_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = float[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + float & d000, float & d001, float & d002, float & d003, + float & d004, float & d005, float & d006, float & d007, + float & d008, float & d009, float & d010, float & d011, + float & d012, float & d013, float & d014, float & d015, + float & d016, float & d017, float & d018, float & d019, + float & d020, float & d021, float & d022, float & d023, + float & d024, float & d025, float & d026, float & d027, + float & d028, float & d029, float & d030, float & d031, + float & d032, float & d033, float & d034, float & d035, + float & d036, float & d037, float & d038, float & d039, + float & d040, float & d041, float & d042, float & d043, + float & d044, float & d045, float & d046, float & d047, + float & d048, float & d049, float & d050, float & d051, + float & d052, float & d053, float & d054, float & d055, + float & d056, float & d057, float & d058, float & d059, + float & d060, float & d061, float & d062, float & d063, + float & d064, float & d065, float & d066, float & d067, + float & d068, float & d069, float & d070, float & d071, + float & d072, float & d073, float & d074, float & d075, + float & d076, float & d077, float & d078, float & d079, + float & d080, float & d081, float & d082, float & d083, + float & d084, float & d085, float & d086, float & d087, + float & d088, float & d089, float & d090, float & d091, + float & d092, float & d093, float & d094, float & d095, + float & d096, float & d097, float & d098, float & d099, + float & d100, float & d101, float & d102, float & d103, + float & d104, float & d105, float & d106, float & d107, + float & d108, float & d109, float & d110, float & d111, + float & d112, float & d113, float & d114, float & d115, + float & d116, float & d117, float & d118, 
float & d119, + float & d120, float & d121, float & d122, float & d123, + float & d124, float & d125, float & d126, float & d127) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k8.f32.tf32.tf32 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " %133, %134, %135;\n" + : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003), + "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007), + "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011), + "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015), + "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019), + "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023), + "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027), + "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031), + "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035), + "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039), + "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043), + "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047), + "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051), + "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055), + "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059), + "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063), + "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067), + "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071), + "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075), + "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079), + "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083), + "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087), + "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091), + "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095), + "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099), + "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103), + "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107), + "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111), + "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115), + "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119), + "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123), + "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "n"(int32_t(scaleD)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x256x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x8x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x8x32_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + 
"wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x8x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x8x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x8x32_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x8x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x16x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x16x32_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x16x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x16x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x16x32_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x16x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x32x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One 
+> +struct SM90_64x32x32_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x32x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x32x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x32x32_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x32x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x64x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x64x32_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + 
"wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x64x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x64x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x64x32_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x64x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x96x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x96x32_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + 
uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x96x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x96x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x96x32_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + 
"+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x96x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x128x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x128x32_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x128x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x128x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = 
GMMA::ScaleOut::One +> +struct SM90_64x128x32_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x128x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x192x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x192x32_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, 
uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " %98;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x192x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x192x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x192x32_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using 
ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " %98;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), 
"+r"(d94), "+r"(d95) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x192x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x256x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x256x32_S32S8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, 
%92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " %130;\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x256x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x256x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x256x32_S32S8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & 
d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " %130;\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), 
"+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x256x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x8x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x8x32_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x8x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x8x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x8x32_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x8x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x16x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x16x32_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x16x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x16x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x16x32_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x16x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x32x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x32x32_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x32x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x32x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x32x32_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, 
%13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x32x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x64x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x64x32_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x64x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x64x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x64x32_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + 
"wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x64x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x96x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x96x32_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x96x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x96x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = 
GMMA::ScaleOut::One +> +struct SM90_64x96x32_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x96x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x128x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x128x32_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, 
uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x128x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x128x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x128x32_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + 
"wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x128x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x192x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x192x32_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95) + { +#if 
defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " %101;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x192x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x192x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x192x32_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, 
uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " %101;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x192x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x256x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x256x32_S32S8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, 
uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " %133;\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), 
"+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x256x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x256x32 TN S32+=S8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x256x32_S32S8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & 
d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " %133;\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x256x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x8x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x8x32_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3) + { +#if 
defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.u8 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x8x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x8x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x8x32_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x8x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x16x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x16x32_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x16x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x16x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x16x32_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x16x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x32x32 TN S32+=S8*U8 
+template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x32x32_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x32x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x32x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x32x32_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x32x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x64x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x64x32_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31) + { +#if 
defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x64x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x64x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x64x32_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x64x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x96x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x96x32_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & 
d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x96x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x96x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x96x32_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), 
"+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x96x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x128x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x128x32_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x128x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x128x32 TN 
S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x128x32_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x128x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x192x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x192x32_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, 
uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " %98;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x192x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x192x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct 
SM90_64x192x32_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " %98;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), 
"+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x192x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x256x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x256x32_S32S8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, 
%78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " %130;\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x256x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x256x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x256x32_S32S8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, 
uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " %130;\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), 
"+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x256x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x8x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x8x32_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.u8 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x8x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x8x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x8x32_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x8x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x16x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x16x32_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use 
SM90_64x16x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x16x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x16x32_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x16x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x32x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x32x32_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x32x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x32x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x32x32_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + 
"wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x32x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x64x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x64x32_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x64x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x64x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x64x32_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + 
uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x64x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x96x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x96x32_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x96x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + 
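+// Note: an editorial usage sketch, not part of the generated atoms. In the
+// *_RS_TN atoms above, A is sourced from registers (ARegisters = uint32_t[4],
+// four packed 8-bit values per register) while B is read through a GMMA
+// shared-memory descriptor (BRegisters = uint64_t[1]); the *_SS_TN atoms take
+// descriptors for both operands. Accumulators are read-modify-write ("+r"),
+// and scaleD selects D = A*B + D (GMMA::ScaleOut::One) versus D = A*B
+// (GMMA::ScaleOut::Zero). A minimal, hypothetical call (placeholder names;
+// in practice these structs are dispatched through cute::MMA_Atom and the
+// MMA_Traits specializations in cute/atom/mma_traits_sm90_gmma.hpp, and the
+// caller must also issue the surrounding wgmma fence/commit/wait sequence):
+//
+//   uint32_t a[4];          // A fragment in registers (s8 x4 per register)
+//   uint64_t desc_b = ...;  // descriptor for the B tile in shared memory
+//   uint32_t d[48] = {};    // 48 s32 accumulators per thread (64x96 tile)
+//   SM90_64x96x32_S32S8U8_RS_TN<>::fma(a[0], a[1], a[2], a[3], desc_b,
+//                                      d[0], d[1], /* ..., */ d[47]);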
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x96x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x96x32_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x96x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x128x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x128x32_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & 
d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x128x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x128x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x128x32_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & 
d60, uint32_t & d61, uint32_t & d62, uint32_t & d63) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x128x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x192x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x192x32_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, 
uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " %101;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x192x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x192x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x192x32_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & 
d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " %101;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x192x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x256x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x256x32_S32S8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, 
uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " %133;\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), 
"+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x256x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x256x32 TN S32+=S8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x256x32_S32S8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + 
uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " %133;\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x256x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x8x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x8x32_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& 
desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.s8 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x8x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x8x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x8x32_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x8x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x16x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x16x32_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x16x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x16x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x16x32_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x16x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + 
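+// Note on usage: each SM90_* struct in this family wraps exactly one `wgmma.mma_async`
+// PTX instruction, issued cooperatively by all 128 threads of a warpgroup. In the `_SS_`
+// variants both A and B are read from shared memory through 64-bit GMMA matrix
+// descriptors (`desc_a`, `desc_b`); `_TN_` fixes both operands K-major, the only layout
+// PTX accepts for integer wgmma. The s32 accumulators travel in `CRegisters` (one value
+// per uint32_t), and `DRegisters = void` because D aliases C in place. A minimal sketch
+// of driving one atom directly -- illustrative only: real kernels go through
+// cute::MMA_Atom and the traits in mma_traits_sm90_gmma.hpp, and the descriptor values
+// below are assumed to have been built elsewhere:
+//
+//   uint64_t desc_a = /* GMMA descriptor for the A tile in shared memory */ 0;
+//   uint64_t desc_b = /* GMMA descriptor for the B tile in shared memory */ 0;
+//   uint32_t d0 = 0, d1 = 0, d2 = 0, d3 = 0;   // s32 accumulators
+//   warpgroup_arrive();                        // fence operands before the async MMA
+//   SM90_64x8x32_S32U8S8_SS_TN<>::fma(desc_a, desc_b, d0, d1, d2, d3);
+//   warpgroup_commit_batch();                  // close the wgmma group ...
+//   warpgroup_wait<0>();                       // ... and drain it before reading d0..d3
+//
+// The warpgroup_* helpers referenced above are the fence/commit/wait utilities declared
+// alongside these atoms.
+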
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x32x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x32x32_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x32x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x32x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x32x32_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x32x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x64x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x64x32_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, 
uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x64x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x64x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x64x32_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x64x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x96x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x96x32_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, 
uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x96x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x96x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x96x32_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), 
"+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x96x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x128x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x128x32_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x128x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x128x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x128x32_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x128x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x192x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x192x32_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & 
d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " %98;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x192x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 
64x192x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x192x32_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " %98;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), 
"+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x192x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x256x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x256x32_S32U8S8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, 
%62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " %130;\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x256x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x256x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x256x32_S32U8S8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & 
d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " %130;\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), 
+ "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x256x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x8x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x8x32_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.s8 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x8x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x8x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x8x32_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x8x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x16x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x16x32_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + 
"n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x16x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x16x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x16x32_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x16x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x32x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x32x32_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x32x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x32x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x32x32_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15) + { +#if 
defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x32x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x64x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x64x32_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x64x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x64x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x64x32_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, 
uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x64x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x96x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x96x32_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x96x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } 
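+
+  // Note on the `_RS_` variants in this region: only B is read through a shared-memory
+  // descriptor; the A fragment comes straight from registers as four packed uint32_t
+  // (ARegisters = uint32_t[4], i.e. 16 bytes per thread, so the 128-thread warpgroup
+  // supplies the full 64x32 u8 A tile: 128 * 16 B = 2048 B). PTX only permits the
+  // register-sourced operand to be A -- hence no `_SR_` family -- and those registers
+  // must be fenced (e.g. via warpgroup_fence_operand) before each wgmma.mma_async issues.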
+}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x96x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x96x32_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x96x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x128x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x128x32_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, 
uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x128x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x128x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x128x32_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + 
uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x128x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x192x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x192x32_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & 
d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " %101;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x192x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x192x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x192x32_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, 
uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " %101;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x192x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x256x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x256x32_S32U8S8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & 
d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.s8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " %133;\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), 
"+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x256x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x256x32 TN S32+=U8*S8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x256x32_S32U8S8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t 
& d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.s8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " %133;\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x256x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x8x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x8x32_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + 
uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8 " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x8x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x8x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x8x32_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3}," + " %4," + " %5," + " %6;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x8x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x16x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x16x32_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x16x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x16x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x16x32_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + " %8," + " %9," + " %10;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x16x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + 
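+// Usage sketch (illustrative only, not part of the generated atom set above):
+// the raw fma can be called directly once 64-bit GMMA matrix descriptors for
+// the A and B shared-memory tiles are in hand. The zero-initialized
+// descriptors below are placeholders only; real descriptors must encode the
+// smem start address, leading-dimension/stride offsets, and swizzle mode, and
+// the call must sit inside the usual warpgroup arrive/commit/wait sequence.
+//
+//   uint32_t acc[8] = {};    // one thread's S32 accumulator fragment
+//   uint64_t desc_a = 0;     // placeholder GMMA descriptor for the u8 A tile
+//   uint64_t desc_b = 0;     // placeholder GMMA descriptor for the u8 B tile
+//   SM90_64x16x32_S32U8U8_SS_TN<>::fma(desc_a, desc_b,
+//                                      acc[0], acc[1], acc[2], acc[3],
+//                                      acc[4], acc[5], acc[6], acc[7]);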
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x32x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x32x32_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x32x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x32x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x32x32_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + " %16," + " %17," + " %18;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x32x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x64x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x64x32_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, 
uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x64x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x64x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x64x32_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + " %32," + " %33," + " %34;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x64x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x96x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x96x32_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, 
uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x96x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x96x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x96x32_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + " %48," + " %49," + " %50;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), 
"+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x96x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x128x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x128x32_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x128x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x128x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x128x32_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + " %64," + " %65," + " %66;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x128x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x192x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x192x32_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & 
d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " %98;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x192x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 
64x192x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x192x32_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + " %96," + " %97," + " %98;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), 
"+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x192x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x256x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x256x32_S32U8U8_SS_TN +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, 
%62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " %130;\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x256x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x256x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x256x32_S32U8U8_SS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint64_t[1]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint64_t const& desc_a, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & 
d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + " %128," + " %129," + " %130;\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), 
+ "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "l"(desc_a), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x256x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x8x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x8x32_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8 " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x8x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x8x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x8x32_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[4]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3}," + "{%4, %5, %6, %7}," + " %8," + " %9;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x8x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x16x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x16x32_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + 
"n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x16x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x16x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x16x32_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[8]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3, + uint64_t const& desc_b, + uint32_t & d0, uint32_t & d1, uint32_t & d2, uint32_t & d3, + uint32_t & d4, uint32_t & d5, uint32_t & d6, uint32_t & d7) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9, %10, %11}," + " %12," + " %13;\n" + : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3), + "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x16x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x32x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x32x32_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x32x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x32x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x32x32_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[16]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15) + { +#if 
defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15}," + "{%16, %17, %18, %19}," + " %20," + " %21;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x32x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x64x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x64x32_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x64x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x64x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x64x32_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[32]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, 
uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31}," + "{%32, %33, %34, %35}," + " %36," + " %37;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x64x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x96x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x96x32_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x96x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } 
+}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x96x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x96x32_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[48]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47}," + "{%48, %49, %50, %51}," + " %52," + " %53;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x96x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x128x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x128x32_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, 
uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x128x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x128x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x128x32_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[64]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + 
uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63}," + "{%64, %65, %66, %67}," + " %68," + " %69;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x128x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x192x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x192x32_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & 
d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " %101;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x192x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x192x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x192x32_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[96]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03, + uint64_t const& desc_b, + uint32_t & d00, uint32_t & d01, uint32_t & d02, uint32_t & d03, + uint32_t & d04, uint32_t & d05, uint32_t & d06, uint32_t & d07, + uint32_t & d08, uint32_t & d09, uint32_t & d10, uint32_t & d11, + uint32_t & d12, uint32_t & d13, uint32_t & d14, uint32_t & d15, + uint32_t & d16, uint32_t & d17, uint32_t & d18, uint32_t & d19, + uint32_t & d20, uint32_t & d21, uint32_t & d22, uint32_t & d23, + uint32_t & d24, uint32_t & d25, uint32_t & d26, uint32_t & d27, + uint32_t & d28, uint32_t & d29, uint32_t & d30, uint32_t & d31, + uint32_t & d32, uint32_t & d33, uint32_t & d34, uint32_t & d35, + uint32_t & d36, uint32_t & d37, uint32_t & d38, uint32_t & d39, + uint32_t & d40, uint32_t & d41, uint32_t & d42, uint32_t & d43, + uint32_t & d44, uint32_t & d45, uint32_t & d46, uint32_t & d47, + uint32_t & d48, uint32_t & d49, uint32_t & d50, uint32_t & d51, + uint32_t & d52, uint32_t & d53, 
uint32_t & d54, uint32_t & d55, + uint32_t & d56, uint32_t & d57, uint32_t & d58, uint32_t & d59, + uint32_t & d60, uint32_t & d61, uint32_t & d62, uint32_t & d63, + uint32_t & d64, uint32_t & d65, uint32_t & d66, uint32_t & d67, + uint32_t & d68, uint32_t & d69, uint32_t & d70, uint32_t & d71, + uint32_t & d72, uint32_t & d73, uint32_t & d74, uint32_t & d75, + uint32_t & d76, uint32_t & d77, uint32_t & d78, uint32_t & d79, + uint32_t & d80, uint32_t & d81, uint32_t & d82, uint32_t & d83, + uint32_t & d84, uint32_t & d85, uint32_t & d86, uint32_t & d87, + uint32_t & d88, uint32_t & d89, uint32_t & d90, uint32_t & d91, + uint32_t & d92, uint32_t & d93, uint32_t & d94, uint32_t & d95) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.u8.satfinite " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95}," + "{%96, %97, %98, %99}," + " %100," + " %101;\n" + : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), + "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07), + "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11), + "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15), + "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19), + "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23), + "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27), + "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31), + "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35), + "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39), + "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43), + "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47), + "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51), + "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55), + "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59), + "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63), + "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67), + "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71), + "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75), + "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79), + "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83), + "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87), + "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91), + "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95) + : "r"(a00), "r"(a01), "r"(a02), "r"(a03), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x192x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x256x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x256x32_S32U8U8_RS_TN +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & 
d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t & d107, + uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111, + uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115, + uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119, + uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123, + uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127) + { +#if defined(CUTE_ARCH_MMA_SM90_ENABLED) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.u8 " + "{%0, %1, %2, %3, %4, %5, %6, %7, " + " %8, %9, %10, %11, %12, %13, %14, %15, " + " %16, %17, %18, %19, %20, %21, %22, %23, " + " %24, %25, %26, %27, %28, %29, %30, %31, " + " %32, %33, %34, %35, %36, %37, %38, %39, " + " %40, %41, %42, %43, %44, %45, %46, %47, " + " %48, %49, %50, %51, %52, %53, %54, %55, " + " %56, %57, %58, %59, %60, %61, %62, %63, " + " %64, %65, %66, %67, %68, %69, %70, %71, " + " %72, %73, %74, %75, %76, %77, %78, %79, " + " %80, %81, %82, %83, %84, %85, %86, %87, " + " %88, %89, %90, %91, %92, %93, %94, %95, " + " %96, %97, %98, %99, %100, %101, %102, %103, " + " %104, %105, %106, %107, %108, %109, %110, %111, " + " %112, %113, %114, %115, %116, %117, %118, %119, " + " %120, %121, %122, %123, %124, %125, %126, %127}," + "{%128, %129, %130, %131}," + " %132," + " %133;\n" + : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003), + "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007), + "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011), + "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015), + "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019), + "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023), + "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027), + "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031), + "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035), + "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039), + "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043), + "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047), + "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051), + "+r"(d052), "+r"(d053), 
"+r"(d054), "+r"(d055), + "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059), + "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063), + "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067), + "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071), + "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075), + "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079), + "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083), + "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087), + "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091), + "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095), + "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099), + "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103), + "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107), + "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111), + "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115), + "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119), + "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123), + "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127) + : "r"(a000), "r"(a001), "r"(a002), "r"(a003), + "l"(desc_b), + "n"(int32_t(scaleD))); +#else + CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x256x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90_ENABLED"); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// MMA 64x256x32 TN S32+=U8*U8 +template< + GMMA::ScaleOut scaleD = GMMA::ScaleOut::One +> +struct SM90_64x256x32_S32U8U8_RS_TN_SATURATE +{ + using DRegisters = void; + using ARegisters = uint32_t[4]; + using BRegisters = uint64_t[1]; + using CRegisters = uint32_t[128]; + + CUTE_HOST_DEVICE static void + fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003, + uint64_t const& desc_b, + uint32_t & d000, uint32_t & d001, uint32_t & d002, uint32_t & d003, + uint32_t & d004, uint32_t & d005, uint32_t & d006, uint32_t & d007, + uint32_t & d008, uint32_t & d009, uint32_t & d010, uint32_t & d011, + uint32_t & d012, uint32_t & d013, uint32_t & d014, uint32_t & d015, + uint32_t & d016, uint32_t & d017, uint32_t & d018, uint32_t & d019, + uint32_t & d020, uint32_t & d021, uint32_t & d022, uint32_t & d023, + uint32_t & d024, uint32_t & d025, uint32_t & d026, uint32_t & d027, + uint32_t & d028, uint32_t & d029, uint32_t & d030, uint32_t & d031, + uint32_t & d032, uint32_t & d033, uint32_t & d034, uint32_t & d035, + uint32_t & d036, uint32_t & d037, uint32_t & d038, uint32_t & d039, + uint32_t & d040, uint32_t & d041, uint32_t & d042, uint32_t & d043, + uint32_t & d044, uint32_t & d045, uint32_t & d046, uint32_t & d047, + uint32_t & d048, uint32_t & d049, uint32_t & d050, uint32_t & d051, + uint32_t & d052, uint32_t & d053, uint32_t & d054, uint32_t & d055, + uint32_t & d056, uint32_t & d057, uint32_t & d058, uint32_t & d059, + uint32_t & d060, uint32_t & d061, uint32_t & d062, uint32_t & d063, + uint32_t & d064, uint32_t & d065, uint32_t & d066, uint32_t & d067, + uint32_t & d068, uint32_t & d069, uint32_t & d070, uint32_t & d071, + uint32_t & d072, uint32_t & d073, uint32_t & d074, uint32_t & d075, + uint32_t & d076, uint32_t & d077, uint32_t & d078, uint32_t & d079, + uint32_t & d080, uint32_t & d081, uint32_t & d082, uint32_t & d083, + uint32_t & d084, uint32_t & d085, uint32_t & d086, uint32_t & d087, + uint32_t & d088, uint32_t & d089, uint32_t & d090, uint32_t & d091, + uint32_t & d092, uint32_t & d093, uint32_t & d094, uint32_t & d095, + uint32_t & d096, uint32_t & d097, uint32_t & d098, uint32_t & d099, + uint32_t & d100, uint32_t & d101, uint32_t & d102, uint32_t & d103, + uint32_t & d104, uint32_t & d105, uint32_t & d106, uint32_t 
& d107,
+      uint32_t & d108, uint32_t & d109, uint32_t & d110, uint32_t & d111,
+      uint32_t & d112, uint32_t & d113, uint32_t & d114, uint32_t & d115,
+      uint32_t & d116, uint32_t & d117, uint32_t & d118, uint32_t & d119,
+      uint32_t & d120, uint32_t & d121, uint32_t & d122, uint32_t & d123,
+      uint32_t & d124, uint32_t & d125, uint32_t & d126, uint32_t & d127)
+  {
+#if defined(CUTE_ARCH_MMA_SM90_ENABLED)
+    asm volatile(
+    "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.u8.satfinite "
+    "{%0, %1, %2, %3, %4, %5, %6, %7, "
+    " %8, %9, %10, %11, %12, %13, %14, %15, "
+    " %16, %17, %18, %19, %20, %21, %22, %23, "
+    " %24, %25, %26, %27, %28, %29, %30, %31, "
+    " %32, %33, %34, %35, %36, %37, %38, %39, "
+    " %40, %41, %42, %43, %44, %45, %46, %47, "
+    " %48, %49, %50, %51, %52, %53, %54, %55, "
+    " %56, %57, %58, %59, %60, %61, %62, %63, "
+    " %64, %65, %66, %67, %68, %69, %70, %71, "
+    " %72, %73, %74, %75, %76, %77, %78, %79, "
+    " %80, %81, %82, %83, %84, %85, %86, %87, "
+    " %88, %89, %90, %91, %92, %93, %94, %95, "
+    " %96, %97, %98, %99, %100, %101, %102, %103, "
+    " %104, %105, %106, %107, %108, %109, %110, %111, "
+    " %112, %113, %114, %115, %116, %117, %118, %119, "
+    " %120, %121, %122, %123, %124, %125, %126, %127},"
+    "{%128, %129, %130, %131},"
+    " %132,"
+    " %133;\n"
+    : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+      "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+      "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+      "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+      "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+      "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+      "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+      "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+      "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+      "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+      "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+      "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+      "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+      "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+      "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+      "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+      "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+      "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+      "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+      "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+      "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+      "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+      "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+      "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+      "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+      "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+      "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+      "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+      "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+      "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+      "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+      "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+    : "r"(a000), "r"(a001), "r"(a002), "r"(a003),
+      "l"(desc_b),
+      "n"(int32_t(scaleD)));
+#else
+    CUTE_RUNTIME_ASSERT("Attempting to use SM90_64x256x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cute
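The structs above are mechanical expansions of one pattern: each wraps exactly one `wgmma.mma_async` shape/type combination, publishes its operand footprint through the `ARegisters`/`BRegisters`/`CRegisters` typedefs, and accumulates in place via the `"+r"` constraints, with `scaleD` baked in as an immediate. A minimal sketch of invoking one directly follows; it assumes the caller has already built a valid shared-memory descriptor for B and issues the surrounding wgmma fence/commit/wait protocol, which in this patch is the job of the collective mainloops, not user code.

// Hedged sketch, not part of the patch: one warpgroup-wide U8*U8+S32 MMA.
// a_frag/desc_b/acc are illustrative names; desc_b must be a valid GMMA
// shared-memory descriptor, and all 128 threads of the warpgroup participate.
#include <cute/arch/mma_sm90_gmma.hpp>

__device__ void wgmma_u8_demo(uint32_t const (&a_frag)[4],  // per-thread A fragment
                              uint64_t desc_b,              // smem descriptor for B
                              uint32_t (&acc)[4])           // per-thread S32 accumulators
{
  using MMA = cute::SM90_64x8x32_S32U8U8_RS_TN<>;  // default scaleD = GMMA::ScaleOut::One
  MMA::fma(a_frag[0], a_frag[1], a_frag[2], a_frag[3],
           desc_b,
           acc[0], acc[1], acc[2], acc[3]);
}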
diff --git a/include/cute/arch/util.hpp b/include/cute/arch/util.hpp
new file mode 100644
index 0000000000..007781f56b
--- /dev/null
+++ b/include/cute/arch/util.hpp
@@ -0,0 +1,178 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/numeric/integer_sequence.hpp>
+
+#if (! defined (__clang__) && __CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)
+  extern "C" {
+  // This NVVM intrinsic is subject to change in future versions of CUDA.
+  // Clients should not call it directly.
+  CUTE_DEVICE uint32_t __nvvm_get_smem_pointer(void*);
+  }
+#endif
+
+namespace cute
+{
+
+/// CUTE helper to cast SMEM pointer to unsigned
+CUTE_HOST_DEVICE
+uint32_t
+cast_smem_ptr_to_uint(void const* const ptr)
+{
+// We prefer to use the new CVTA intrinsics if they are available, otherwise we will fall back to
+// the previous internal intrinsics if they are available.
+#if (! defined (__clang__) && defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ >= 11)
+  //
+  // This NVVM intrinsic converts an address in shared memory to a plain
+  // unsigned integer. This is necessary to pass to shared memory instructions
+  // in inline PTX.
+  //
+  // In CUDA 11 and beyond, this replaces __nvvm_get_smem_pointer()  [only available in 10.2].
+  //
+  //__device__ size_t __cvta_generic_to_shared(void* ptr);
+
+  /// CUTE helper to get SMEM pointer
+  return static_cast<uint32_t>(__cvta_generic_to_shared(ptr));
+
+#elif (! defined (__clang__) && defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)
+
+  return __nvvm_get_smem_pointer(ptr);
+
+#elif defined(__CUDA_ARCH__)
+
+  uint32_t smem_ptr;
+
+  asm(
+  "{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 %0, smem_ptr; }\n"
+    : "=r"(smem_ptr) : "l"(ptr));
+
+  return smem_ptr;
+
+#else
+
+  (void) ptr;
+  printf("ERROR: cast_smem_ptr_to_uint not supported but used.\n");
+  return 0;
+
+#endif
+}
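`cast_smem_ptr_to_uint` is the bridge every inline-PTX path in this patch relies on: shared-memory instructions in PTX take a 32-bit `.shared` address, not a generic C++ pointer. A hedged, standalone sketch of the idiom (assumes sm_80+ for `cp.async`, a launch of at most 128 threads, and hypothetical names `buf`/`gmem_src`; the SM80 copy atoms added elsewhere in this patch wrap the same pattern):

#include <cute/arch/util.hpp>

__global__ void cp_async_demo(float const* gmem_src)
{
  __shared__ float buf[128];
  // Convert a generic pointer into the 32-bit .shared address cp.async requires.
  uint32_t smem_addr = cute::cast_smem_ptr_to_uint(&buf[threadIdx.x]);
  asm volatile("cp.async.ca.shared.global [%0], [%1], 4;\n"
               :: "r"(smem_addr), "l"(&gmem_src[threadIdx.x]));
  asm volatile("cp.async.wait_all;\n" ::: "memory");  // drain the async copy
  __syncthreads();
}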
+
+//
+// Utility for pointer interfaces
+//
+
+namespace detail {
+
+template <class Fn,
+          class PtrS, int... Is,
+          class PtrD, int... Id>
+CUTE_HOST_DEVICE constexpr
+void
+explode(Fn fn,
+        PtrS&& s, int_sequence<Is...>,
+        PtrD&& d, int_sequence<Id...>)
+{
+  return fn(s[Is]..., d[Id]...);
+}
+
+template <class Fn,
+          class PtrA, int... Ia,
+          class PtrB, int... Ib,
+          class PtrC, int... Ic>
+CUTE_HOST_DEVICE constexpr
+void
+explode(Fn fn,
+        PtrA&& a, int_sequence<Ia...>,
+        PtrB&& b, int_sequence<Ib...>,
+        PtrC&& c, int_sequence<Ic...>)
+{
+  return fn(a[Ia]..., b[Ib]..., c[Ic]...);
+}
+
+template <class Fn,
+          class PtrD, int... Id,
+          class PtrA, int... Ia,
+          class PtrB, int... Ib,
+          class PtrC, int... Ic>
+CUTE_HOST_DEVICE constexpr
+void
+explode(Fn fn,
+        PtrD&& d, int_sequence<Id...>,
+        PtrA&& a, int_sequence<Ia...>,
+        PtrB&& b, int_sequence<Ib...>,
+        PtrC&& c, int_sequence<Ic...>)
+{
+  return fn(d[Id]..., a[Ia]..., b[Ib]..., c[Ic]...);
+}
+
+} // end namespace detail
+
+template <int SizeS, int SizeD,
+          class Fn, class PtrS, class PtrD>
+CUTE_HOST_DEVICE constexpr
+void
+explode(Fn fn, PtrS&& s, PtrD&& d)
+{
+  return detail::explode(fn,
+                         s, make_int_sequence<SizeS>{},
+                         d, make_int_sequence<SizeD>{});
+}
+
+template <int SizeA, int SizeB, int SizeC,
+          class Fn, class PtrA, class PtrB, class PtrC>
+CUTE_HOST_DEVICE constexpr
+void
+explode(Fn fn, PtrA&& a, PtrB&& b, PtrC&& c)
+{
+  return detail::explode(fn,
+                         a, make_int_sequence<SizeA>{},
+                         b, make_int_sequence<SizeB>{},
+                         c, make_int_sequence<SizeC>{});
+}
+
+template <int SizeD, int SizeA, int SizeB, int SizeC,
+          class Fn, class PtrD, class PtrA, class PtrB, class PtrC>
+CUTE_HOST_DEVICE constexpr
+void
+explode(Fn fn, PtrD&& d, PtrA&& a, PtrB&& b, PtrC&& c)
+{
+  return detail::explode(fn,
+                         d, make_int_sequence<SizeD>{},
+                         a, make_int_sequence<SizeA>{},
+                         b, make_int_sequence<SizeB>{},
+                         c, make_int_sequence<SizeC>{});
+}
+
+} // end namespace cute
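The `explode` helpers are the glue between the register-array typedefs on the atoms (for example `CRegisters = uint32_t[128]`) and the flat scalar parameter lists of the `fma`/`copy` functions: each array is index-expanded through an `int_sequence` so one generic call site can invoke any operation. A self-contained toy, assuming the explicit-size overloads reconstructed above (`AxpbyOp` is an illustrative stand-in for an arch-level op):

#include <cute/arch/util.hpp>

struct AxpbyOp {
  CUTE_HOST_DEVICE static void op(int s0, int s1, int& d0, int& d1) {
    d0 += s0; d1 += s1;
  }
};

CUTE_HOST_DEVICE void explode_demo()
{
  int src[2] = {1, 2};
  int dst[2] = {10, 20};
  // Expands to AxpbyOp::op(src[0], src[1], dst[0], dst[1]); dst becomes {11, 22}.
  cute::explode<2, 2>(AxpbyOp::op, src, dst);
}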
diff --git a/include/cute/atom/copy_atom.hpp b/include/cute/atom/copy_atom.hpp
new file mode 100644
index 0000000000..2c5d9c557a
--- /dev/null
+++ b/include/cute/atom/copy_atom.hpp
@@ -0,0 +1,671 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/arch/copy.hpp>
+
+#include <cute/tensor.hpp>
+#include <cute/tensor_predicate.hpp>
+
+#include <cute/atom/copy_traits.hpp>
+
+namespace cute {
+
+// Generic copy_unpack for any Copy_Traits
+template <class Operation,
+          class TS, class SLayout,
+          class TD, class DLayout>
+CUTE_HOST_DEVICE constexpr
+void
+copy_unpack(Copy_Traits<Operation> const&,
+            Tensor<TS,SLayout> const& src,
+            Tensor<TD,DLayout>      & dst)
+{
+  // Specializations can generalize on these checks
+  //static_assert(is_smem<TS>::value, "Expected smem for this Copy_Traits");
+  //static_assert(is_rmem<TD>::value, "Expected rmem for this Copy_Traits");
+
+  using RegistersSrc = typename Operation::SRegisters;
+  using RegistersDst = typename Operation::DRegisters;
+  using RegTypeSrc   = typename std::remove_extent<RegistersSrc>::type;
+  using RegTypeDst   = typename std::remove_extent<RegistersDst>::type;
+  constexpr int RegNumSrc = std::extent<RegistersSrc>::value;
+  constexpr int RegNumDst = std::extent<RegistersDst>::value;
+
+  Tensor rS = recast<RegTypeSrc>(src);
+  Tensor rD = recast<RegTypeDst>(dst);
+
+  CUTE_STATIC_ASSERT_V(size(rS) == Int<RegNumSrc>{},
+    "In CopyAtom, src layout doesn't vectorize into registers. This src layout is incompatible with this tiled copy.");
+  CUTE_STATIC_ASSERT_V(size(rD) == Int<RegNumDst>{},
+    "In CopyAtom, dst layout doesn't vectorize into registers. This dst layout is incompatible with this tiled copy.");
+
+  detail::explode(Operation::copy,
+                  rS, make_int_sequence<RegNumSrc>{},
+                  rD, make_int_sequence<RegNumDst>{});
+}
+
+
+template <class... Args>
+struct Copy_Atom;
+
+template <class CopyOperation, class T>
+struct Copy_Atom<CopyOperation, T> : Copy_Atom<Copy_Traits<CopyOperation>, T>
+{};
+
+template <class... Args, class T>
+struct Copy_Atom<Copy_Traits<Args...>, T>
+  : Copy_Traits<Args...>
+{
+  using Traits = Copy_Traits<Args...>;
+
+  // Bit and Thr layouts from the Copy_Traits
+  using ThrID        = typename Traits::ThrID;
+  using BitLayoutSrc = typename Traits::SrcLayout;
+  using BitLayoutDst = typename Traits::DstLayout;
+  using BitLayoutRef = typename Traits::RefLayout;
+
+  using ValType = T;
+
+  using ValLayoutSrc = decltype(upcast<sizeof_bits<ValType>::value>(BitLayoutSrc{}));
+  using ValLayoutDst = decltype(upcast<sizeof_bits<ValType>::value>(BitLayoutDst{}));
+  using ValLayoutRef = decltype(upcast<sizeof_bits<ValType>::value>(BitLayoutRef{}));
+
+  CUTE_STATIC_ASSERT_V(size<0>(ValLayoutSrc{}) == size(ThrID{}), "CopyOperation is not valid for Src of ValType.");
+  CUTE_STATIC_ASSERT_V(size<0>(ValLayoutDst{}) == size(ThrID{}), "CopyOperation is not valid for Dst of ValType.");
+  CUTE_STATIC_ASSERT_V(size<0>(ValLayoutRef{}) == size(ThrID{}), "CopyOperation is not valid for Ref of ValType.");
+
+  static constexpr int NumValSrc = size<1>(ValLayoutSrc{});
+  static constexpr int NumValDst = size<1>(ValLayoutDst{});
+
+  // Additional Trait parameters/transformations
+  template <class... TraitsArgs>
+  CUTE_HOST_DEVICE
+  auto
+  with(TraitsArgs&&... args) const {
+    auto traits = Traits::with(std::forward<TraitsArgs>(args)...);
+    return Copy_Atom<decltype(traits), T>{traits};
+  }
+
+  // Print thread and data layouts for debugging
+  CUTE_HOST_DEVICE static
+  void
+  print_all()
+  {
+    print("ThrID:        "); print(ThrID{});        print("\n");
+    print("BitLayoutSrc: "); print(BitLayoutSrc{}); print("\n");
+    print("BitLayoutDst: "); print(BitLayoutDst{}); print("\n");
+    print("BitLayoutRef: "); print(BitLayoutRef{}); print("\n");
+    print("ValLayoutSrc: "); print(ValLayoutSrc{}); print("\n");
+    print("ValLayoutDst: "); print(ValLayoutDst{}); print("\n");
+    print("ValLayoutRef: "); print(ValLayoutRef{}); print("\n");
+    print("ValueType:    %db", sizeof_bits<ValType>::value); print("\n");
+  }
+
+  //
+  // Tensor call interfaces
+  //
+
+  // Cast, check, and call
+  template <class TS, class SLayout,
+            class TD, class DLayout>
+  CUTE_HOST_DEVICE
+  void
+  call(Tensor<TS,SLayout> const& src,
+       Tensor<TD,DLayout>      & dst) const
+  {
+    static_assert(SLayout::rank == 1, "Expected rank-1 src tensor");
+    static_assert(DLayout::rank == 1, "Expected rank-1 dst tensor");
+
+    if constexpr (is_constant<NumValSrc, decltype(size(src))>::value ||
+                  is_constant<NumValDst, decltype(size(dst))>::value) {
+      // Dispatch to unpack for instruction
+      return copy_unpack(*this, src, dst);
+    } else {
+      // Recurse if needed by peeling the tensor mode
+      return copy(*this, tensor<0>(src), tensor<0>(dst));
+    }
+  }
+
+  // Accept mutable temporaries
+  template <class TS, class SLayout,
+            class TD, class DLayout>
+  CUTE_HOST_DEVICE
+  void
+  call(Tensor<TS,SLayout> const& src,
+       Tensor<TD,DLayout>     && dst) const
+  {
+    return call(src, dst);
+  }
+};
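To ground the interface just defined: a `Copy_Atom` binds one arch-level copy operation to a value type, and `cute::copy` recurses through `call` until a tensor mode matches the atom's value count, then dispatches to `copy_unpack`. A hedged sketch (the layouts and the choice of `UniversalCopy` here are illustrative, not taken from this patch):

#include <cute/tensor.hpp>
#include <cute/atom/copy_atom.hpp>

__device__ void copy_atom_demo(float const* gmem_in)
{
  using namespace cute;
  Tensor gS = make_tensor(make_gmem_ptr(gmem_in), make_layout(Int<8>{}));  // 8 floats in gmem
  Tensor rD = make_tensor<float>(make_layout(Int<8>{}));                   // owning register fragment

  // UniversalCopy<float> moves one float per invocation; cute::copy applies
  // the atom across all eight values by peeling tensor modes in call().
  copy(Copy_Atom<UniversalCopy<float>, float>{}, gS, rD);
}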
+
+//
+// A tiling of copy atoms
+//
+
+template <class Copy_Atom,
+          class LayoutCopy_TV,  // (tid,vid) -> coord   [Need not be 2D...]
+          class ShapeTile_MN>   // coord space
+struct TiledCopy : Copy_Atom
+{
+  // Layout information from the CopyAtom
+  using AtomThrID     = typename Copy_Atom::ThrID;        // thrid -> thr_idx
+  using AtomLayoutSrc = typename Copy_Atom::ValLayoutSrc; // (thr,val) -> offset
+  using AtomLayoutDst = typename Copy_Atom::ValLayoutDst; // (thr,val) -> offset
+  using AtomLayoutRef = typename Copy_Atom::ValLayoutRef; // (thr,val) -> offset
+
+  using AtomNumThr = decltype(size<0>(AtomLayoutRef{}));
+  using AtomNumVal = decltype(size<1>(AtomLayoutRef{}));
+
+  // Layout information for the TiledCopy
+  using Tiler_MN       = ShapeTile_MN;
+  using TiledShape_MN  = decltype(shape(ShapeTile_MN{}));
+  using TiledLayout_TV = LayoutCopy_TV;
+  using TiledNumThr    = decltype(size<0>(TiledLayout_TV{}));
+  using TiledNumVal    = decltype(size<1>(TiledLayout_TV{}));
+
+  CUTE_STATIC_ASSERT_V(TiledNumThr{} % AtomNumThr{} == Int<0>{}, "TiledCopy uses too few thrs for selected CopyAtom");
+  CUTE_STATIC_ASSERT_V(TiledNumVal{} % AtomNumVal{} == Int<0>{}, "TiledCopy uses too few vals for selected CopyAtom");
+
+  // Tile a tensor or a layout from shape
+  //   (M,N,...)
+  // to shape
+  //   ((ThrV,ThrX),FrgV,(RestM,RestN,...))
+  // where
+  //   ThrV:  The threads local to a COPY_ATOM Src.
+  //   ThrX:  The threads tiled across COPY_ATOMs Src.
+  //   FrgV:  The values local to a COPY_ATOM Src.
+  //   RestM: The values tiled in M.
+  //   RestN: The values tiled in N.
+  template <class STensor>
+  CUTE_HOST_DEVICE constexpr static
+  auto
+  tidfrg_S(STensor&& stensor)
+  {
+    return thrfrg(stensor, right_inverse(AtomLayoutRef{}).compose(AtomLayoutSrc{}));
+  }
+
+  // Tile a tensor or a layout from shape
+  //   (M,N,...)
+  // to shape
+  //   ((ThrV,ThrX),FrgV,(RestM,RestN,...))
+  // where
+  //   ThrV:  The threads local to a COPY_ATOM Dst.
+  //   ThrX:  The threads tiled across COPY_ATOMs Dst.
+  //   FrgV:  The values local to a COPY_ATOM Dst.
+  //   RestM: The values tiled in M.
+  //   RestN: The values tiled in N.
+  template <class DTensor>
+  CUTE_HOST_DEVICE constexpr static
+  auto
+  tidfrg_D(DTensor&& dtensor)
+  {
+    return thrfrg(dtensor, right_inverse(AtomLayoutRef{}).compose(AtomLayoutDst{}));
+  }
+
+  template <class Tensor, class Ref2TrgLayout>
+  CUTE_HOST_DEVICE constexpr static
+  auto
+  thrfrg(Tensor&& tensor, Ref2TrgLayout const& ref2trg)
+  {
+    constexpr int R = remove_cvref_t<Tensor>::rank;
+    static_assert(R >= rank_v<TiledShape_MN>, "Rank of tensor to be partitioned too small.");
+    // Generalize the dimension checks for arbitrary rank
+    //CUTE_STATIC_ASSERT_V(size<0>(stensor) % size<0>(TiledShape_MNK{}) == Int<0>{});
+    //CUTE_STATIC_ASSERT_V(size<1>(stensor) % size<1>(TiledShape_MNK{}) == Int<0>{});
+
+    // Take the thrs/vals that the atom is interested in
+    // NOTE: Assumes the AtomNumThr are contiguous and identity within TiledThrID
+    auto atom_layout_TV = zipped_divide(TiledLayout_TV{}, make_shape(AtomNumThr{}, AtomNumVal{}));
+    // ((atom_tid,atom_val),(rest_tid,rest_val)) -> (m,n)
+
+    // Transform to the trg layout
+    auto trg_layout_TV = atom_layout_TV.compose(ref2trg, _);
+    // ((trg_tid,trg_val),(rest_tid,rest_val)) -> (m,n)
+
+    // Transform the thrs mode from thrid to thr_idx
+    // NOTE: Assumes the AtomNumThr are contiguous and identity within TiledThrID
+    auto thrval2mn = coalesce(zip(trg_layout_TV), Shape<_1,Shape<_1,_1>>{});
+    // ((trg_tid,rest_tid),(trg_val,rest_val)) -> (m,n)
+
+    /// ==================
+
+    // Tile the tensor for TiledLayout
+    auto t_tensor = zipped_divide(tensor, Tiler_MN{});
+    // ((TileM,TileN,...),(RestM,RestN,...))
+
+    // Transform the tile mode
+    auto tv_tensor = t_tensor.compose(thrval2mn, _);
+    // ((thrid,val),(RM,RN,...))
+
+    // Unfold and return
+    return tv_tensor(make_coord(_,_), _);
+  }
+ // Then, we only need the shape+layout of those size<0>(tensor) elements in TiledLayout_TV + // and that shape is what we gather from the other modes of tensor + + auto V = size<0>(tensor); + + auto frg_layout_mn = upcast(right_inverse(TiledLayout_TV{}).with_shape(TiledShape_MN{})); + // (m,n) -> v_idx -- The shape and order of the V inside of TiledLayout_TV + + auto frg_layout_v = zipped_divide(logical_product(make_layout(V), right_inverse(frg_layout_mn)), make_layout(AtomNumVal{})); + // (atom_vals,rest_vals) -> (v,m,n) + + /// ======= + + // Tile the tensor for TileFrg + auto t_tensor = zipped_divide(tensor, prepend(product_each(shape(frg_layout_mn)), V)); + // ((TileV,TileM,TileN,...),(1,RestM,RestN,...)) + + // Transform the tile mode + auto v_tensor = t_tensor.compose(frg_layout_v, _); + // ((atom_vals,rest_vals),(1,RM,RN,...)) + + // Unfold and return + return v_tensor(_, append(Int<0>{},_)); + } + + CUTE_HOST_DEVICE constexpr static + auto + get_layoutS_MN() + { + // (M,N) -> (M,N) + auto ref_S = make_layout(TiledShape_MN{}); + // (thr_idx,val_idx) -> (M,N) + auto layoutS_TV = tidfrg_S(ref_S); + // (M,K) -> (thr_idx,val_idx) + auto layoutS_MK = right_inverse(layoutS_TV).with_shape(shape(ref_S)); + + // athrid = (v,m,k) -> thr_idx + auto thrID_S = make_layout(size<0>(TiledLayout_TV{})); + + return cute::make_tuple(layoutS_MK, thrID_S); + } + + CUTE_HOST_DEVICE constexpr static + auto + get_layoutS_TV() + { + // (M,N) -> (M,N) + auto ref_S = make_layout(TiledShape_MN{}); + // (thr_idx,val_idx) -> (M,N) + return tidfrg_S(ref_S)(_,_,Int<0>{}); + } + + CUTE_HOST_DEVICE constexpr static + auto + get_layoutD_MN() + { + // (M,N) -> (M,N) + auto ref_D = make_layout(TiledShape_MN{}); + // (thr_idx,val_idx) -> (M,N) + auto layoutD_TV = tidfrg_D(ref_D); + // (M,K) -> (thr_idx,val_idx) + auto layoutD_MK = right_inverse(layoutD_TV).with_shape(shape(ref_D)); + + // athrid = (v,m,k) -> thr_idx + auto thrID_D = make_layout(size<0>(TiledLayout_TV{})); + + return cute::make_tuple(layoutD_MK, thrID_D); + } + + CUTE_HOST_DEVICE constexpr static + auto + get_layoutD_TV() + { + // (M,N) -> (M,N) + auto ref_D = make_layout(TiledShape_MN{}); + // (thr_idx,val_idx) -> (M,N) + return tidfrg_D(ref_D)(_,_,Int<0>{}); + } + + template + struct ThrCopy : Copy_Atom + { + ThrIdx thr_idx_; + + CUTE_HOST_DEVICE + ThrCopy(ThrIdx const& thr_idx) : thr_idx_(thr_idx) {} + + template + CUTE_HOST_DEVICE + auto + partition_S(STensor&& stensor) { + //static_assert(sizeof(typename remove_cvref_t::value_type) == sizeof(typename Copy_Atom::ValType), + // "Expected ValType for tiling SrcTensor."); + auto thr_tensor = make_tensor(std::forward(stensor).data(), tidfrg_S(stensor.layout())); + return thr_tensor(thr_idx_, _, repeat>(_)); + } + + template + CUTE_HOST_DEVICE + auto + partition_D(DTensor&& dtensor) { + //static_assert(sizeof(typename remove_cvref_t::value_type) == sizeof(typename Copy_Atom::ValType), + // "Expected ValType for tiling DstTensor."); + auto thr_tensor = make_tensor(std::forward(dtensor).data(), tidfrg_D(dtensor.layout())); + return thr_tensor(thr_idx_, _, repeat>(_)); + } + + template + CUTE_HOST_DEVICE static + auto + retile_S(STensor&& stensor) { + static_assert(sizeof(typename remove_cvref_t::value_type) == sizeof(typename Copy_Atom::ValType), + "Expected ValType for tiling SrcTensor."); + return make_tensor(std::forward(stensor).data(), TiledCopy::retile(stensor.layout())); + } + + template + CUTE_HOST_DEVICE static + auto + retile_D(DTensor&& dtensor) { + static_assert(sizeof(typename 
remove_cvref_t::value_type) == sizeof(typename Copy_Atom::ValType), + "Expected ValType for tiling DstTensor."); + return make_tensor(std::forward(dtensor).data(), TiledCopy::retile(dtensor.layout())); + } + }; + + template ::value)> + CUTE_HOST_DEVICE static + auto + get_slice(ThrIdx const& thr_idx) + { + return ThrCopy(thr_idx); + } + + template ::value)> + CUTE_HOST_DEVICE static + auto + get_thread_slice(ThrIdx const& thr_idx) + { + return get_slice(thr_idx); + } +}; + + +template +CUTE_HOST_DEVICE +auto +make_tiled_copy_impl(Copy_Atom const& atom, + LayoutCopy_TV const&, + Tile const&) +{ + return TiledCopy, LayoutCopy_TV, Tile>{atom}; +} + +// +// These tile the Copy_Atom as a whole +// + +template +CUTE_HOST_DEVICE +auto +make_tiled_copy_A(Copy_Atom const& copy_atom, + TiledMMA const& tiled_mma) +{ + using MNK = typename TiledMMA::TiledShape_MNK; + return make_tiled_copy_impl(copy_atom, tiled_mma.get_layoutA_TV(), make_shape(size<0>(MNK{}),size<2>(MNK{}))); +} + +template +CUTE_HOST_DEVICE +auto +make_tiled_copy_B(Copy_Atom const& copy_atom, + TiledMMA const& tiled_mma) +{ + using MNK = typename TiledMMA::TiledShape_MNK; + return make_tiled_copy_impl(copy_atom, tiled_mma.get_layoutB_TV(), make_shape(size<1>(MNK{}),size<2>(MNK{}))); +} + +template +CUTE_HOST_DEVICE +auto +make_tiled_copy_C(Copy_Atom const& copy_atom, + TiledMMA const& tiled_mma) +{ + using MNK = typename TiledMMA::TiledShape_MNK; + return make_tiled_copy_impl(copy_atom, tiled_mma.get_layoutC_TV(), make_shape(size<0>(MNK{}),size<1>(MNK{}))); +} + +template > +CUTE_HOST_DEVICE +auto +make_tiled_copy(Copy_Atom const& copy_atom, + ThrLayout const& thr_layout = {}, // (m,n) -> thr_idx + ValLayout const& val_layout = {}) +{ + constexpr int R = cute::max(rank_v, rank_v); + + auto thr_layout_mn = append(thr_layout, Layout<_1>{}); + auto val_layout_mn = append(val_layout, Layout<_1>{}); + + // Take the raked_products to compute the Layout_MN + auto layout_mn = raked_product(thr_layout_mn, val_layout_mn); + auto layout_tv = right_inverse(layout_mn).with_shape(make_shape(size(thr_layout), size(val_layout))); + + //print("thr_layout: "); print(thr_layout_mn); print("\n"); + //print("val_layout: "); print(val_layout_mn); print("\n"); + //print("layout_mn : "); print(layout_mn); print("\n"); + //print("layout_tv : "); print(layout_tv); print("\n"); + + return make_tiled_copy_impl(copy_atom, layout_tv, product_each(shape(layout_mn))); +} + +// Make a TiledCopy out of the copy_atom that matches the Src-Layout of tiled_copy +template +CUTE_HOST_DEVICE +auto +make_tiled_copy_S(Copy_Atom const& copy_atom, + TiledCopy const& tiled_copy) +{ + return make_tiled_copy_impl(copy_atom, tiled_copy.get_layoutS_TV(), typename TiledCopy::Tiler_MN{}); +} + +// Make a TiledCopy out of the copy_atom that matches the Dst-Layout of tiled_copy +template +CUTE_HOST_DEVICE +auto +make_tiled_copy_D(Copy_Atom const& copy_atom, + TiledCopy const& tiled_copy) +{ + return make_tiled_copy_impl(copy_atom, tiled_copy.get_layoutD_TV(), typename TiledCopy::Tiler_MN{}); +} + +// +// Size +// + +// The logical size of a TileCopy +template +CUTE_HOST_DEVICE constexpr +auto +tile_size(TiledCopy const&) +{ + return size(typename TiledCopy::TiledShape_MN{}); +} + +// The number of threads involved in a TiledCopy +template +CUTE_HOST_DEVICE constexpr +auto +size(TiledCopy const&) +{ + return typename TiledCopy::TiledNumThr{}; +} + +// +// Display utilities +// + +template +CUTE_HOST_DEVICE +auto +print_latex(TiledCopy const& copy) +{ + auto [layoutS_MN, thrID_S] = 
copy.get_layoutS_MN(); + auto [layoutD_MN, thrID_D] = copy.get_layoutD_MN(); + + print_latex_copy(layoutS_MN, thrID_S, + layoutD_MN, thrID_D); +} + +// MNK Copy Layout to Latex TIKZ -- 8-value color coded by thread +template +CUTE_HOST_DEVICE +void +print_latex_copy(LayoutS const& S, ThrIDS const& TS, // (m,n) -> (tid,vid) and tid -> thr_idx + LayoutD const& D, ThrIDD const& TD) // (m,n) -> (tid,vid) and tid -> thr_idx +{ + CUTE_STATIC_ASSERT_V(rank(S) == Int<2>{}); + CUTE_STATIC_ASSERT_V(rank(D) == Int<2>{}); + + assert(size<0>(S) == size<0>(D)); + assert(size<1>(S) == size<1>(D)); + + char const* latex_header = + "\\documentclass{standalone}\n" + "\\usepackage{tikz}\n" + "\\usetikzlibrary{external}\n" + "\\tikzexternalize\n" + "\\begin{document}\n" + "\\begin{tikzpicture}[x={(0cm,-1cm)},y={(1cm,0cm)},box/.style={rectangle,draw=black,thick,minimum size=1cm,anchor=center}]\n\n"; + char const* latex_footer = + "\\end{tikzpicture}\n" + "\\end{document}\n"; + + char const* color_map[8] = {"{rgb,255:red,175;green,175;blue,255}", + "{rgb,255:red,175;green,255;blue,175}", + "{rgb,255:red,255;green,255;blue,175}", + "{rgb,255:red,255;green,175;blue,175}", + "{rgb,255:red,210;green,210;blue,255}", + "{rgb,255:red,210;green,255;blue,210}", + "{rgb,255:red,255;green,255;blue,210}", + "{rgb,255:red,255;green,210;blue,210}",}; + + // Header + printf("%% LayoutS: "); print(S); printf("\n"); + printf("%% ThrIDS : "); print(TS); printf("\n"); + printf("%% LayoutD: "); print(D); printf("\n"); + printf("%% ThrIDD : "); print(TD); printf("\n\n"); + + printf(latex_header); + + // S starting at 0,0 + for (int i = 0; i < size<0>(S); ++i) { + for (int j = 0; j < size<1>(S); ++j) { + int thrid = S(i,j) % size(TS); + int val_idx = S(i,j) / size(TS); + int thr_idx = TS(thrid); + + printf("\\node[box,fill=%s] at (%d,%d) {\\shortstack{T%d \\\\ V%d}};\n", + color_map[thr_idx % 8], + i, j, + thr_idx, val_idx); + } + } + + // D starting at 0,size<1>(S)+3 + for (int i = 0; i < size<0>(D); ++i) { + for (int j = 0; j < size<1>(D); ++j) { + int thrid = D(i,j) % size(TD); + int val_idx = D(i,j) / size(TD); + int thr_idx = TD(thrid); + + printf("\\node[box,fill=%s] at (%d,%d) {\\shortstack{T%d \\\\ V%d}};\n", + color_map[thr_idx % 8], + i, j + size<1>(S) + 3, + thr_idx, val_idx); + } + } + + // S Labels + for (int i = 0, j = -1; i < size<0>(S); ++i) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, j, i); + } + for (int j = 0, i = -1; j < size<1>(S); ++j) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, j, j); + } + // D Labels + for (int i = 0, j = size<1>(D); i < size<0>(S); ++i) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, j + size<1>(S) + 3, i); + } + for (int j = 0, i = -1; j < size<1>(D); ++j) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, j + size<1>(S) + 3, j); + } + + // Footer + printf(latex_footer); +} + +} // end namespace cute + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include +// Config +#if (__CUDACC_VER_MAJOR__ >= 12) +# define CUTE_COPY_ATOM_TMA_SM90_ENABLED +#endif + +#if defined(CUTE_COPY_ATOM_TMA_SM90_ENABLED) +#include +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cute/atom/copy_traits.hpp b/include/cute/atom/copy_traits.hpp new file mode 100644 index 0000000000..83cb05652a --- /dev/null +++ b/include/cute/atom/copy_traits.hpp @@ -0,0 +1,76 @@ 
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/arch/copy.hpp>
+
+namespace cute
+{
+
+template <class CopyOperation, class... CopyOpArgs>
+struct Copy_Traits
+{
+  static_assert(sizeof(CopyOperation) == 0, "Copy_Traits not implemented for this Copy_Operation.");
+};
+
+template <class S, class D>
+struct Copy_Traits<UniversalCopy<S,D>>
+{
+  // Logical thread id to thread idx (one-thread)
+  using ThrID = Layout<_1>;
+
+  // Map from (src-thr,src-val) to bit
+  using SrcLayout = Layout<Shape<_1,Int<sizeof_bits<S>::value>>>;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = Layout<Shape<_1,Int<sizeof_bits<D>::value>>>;
+
+  // Reference map from (thr,val) to bit
+  using RefLayout = SrcLayout;
+};
+
+template <>
+struct Copy_Traits<DefaultCopy>
+{
+  // Logical thread id to thread idx (one-thread)
+  using ThrID = Layout<_1>;
+
+  // Map from (src-thr,src-val) to bit
+  using SrcLayout = Layout<Shape<_1,_1>, Stride<_0,_0>>;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = Layout<Shape<_1,_1>, Stride<_0,_0>>;
+
+  // Reference map from (thr,val) to bit
+  using RefLayout = SrcLayout;
+};
+
+} // end namespace cute
diff --git a/include/cute/atom/copy_traits_sm75.hpp b/include/cute/atom/copy_traits_sm75.hpp
new file mode 100644
index 0000000000..13eb166e29
--- /dev/null
+++ b/include/cute/atom/copy_traits_sm75.hpp
@@ -0,0 +1,143 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2.
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include +#include + +#include + +namespace cute +{ + +template <> +struct Copy_Traits +{ + // Logical thread id to thread idx (warp) + using ThrID = Layout<_32>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout,_128>, + Stride, _1>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout, + Stride<_32, _1>>; + + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; +}; + +template <> +struct Copy_Traits +{ + // Logical thread id to thread idx (warp) + using ThrID = Layout<_32>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout,_128>, + Stride, _1>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_32,Stride< _1,_1024>>>; + + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; +}; + +template <> +struct Copy_Traits +{ + // Logical thread id to thread idx (warp) + using ThrID = Layout<_32>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout, + Stride<_128, _1>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_32,Stride< _1,_1024>>>; + + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; +}; + +template <> +struct Copy_Traits +{ + // Logical thread id to thread idx (warp) + using ThrID = Layout<_32>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout,_128>, + Stride, _1>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout,Shape <_16, _2>>, + Stride,Stride< _1,_128>>>; + + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; +}; + +template <> +struct Copy_Traits +{ + // Logical thread id to thread idx (warp) + using ThrID = Layout<_32>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout,_128>, + Stride, _1>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout,Shape <_16, _2, _2>>, + Stride,Stride< _1,_128,_1024>>>; + + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; +}; + +template <> +struct Copy_Traits +{ + // Logical thread id to thread idx (warp) + using ThrID = Layout<_32>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout, + Stride<_128, _1>>; + // Map from (dst-thr,dst-val) to 
bit + using DstLayout = Layout,Shape <_16, _2, _4>>, + Stride,Stride< _1,_128,_1024>>>; + + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; +}; + +} // end namespace cute diff --git a/include/cute/atom/copy_traits_sm80.hpp b/include/cute/atom/copy_traits_sm80.hpp new file mode 100644 index 0000000000..089d19347f --- /dev/null +++ b/include/cute/atom/copy_traits_sm80.hpp @@ -0,0 +1,98 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/arch/copy_sm80.hpp>
+#include <cute/atom/copy_traits.hpp>
+
+#include <cute/layout.hpp>
+
+namespace cute
+{
+
+template <class S, class D>
+struct Copy_Traits<SM80_CP_ASYNC_CACHEALWAYS<S,D>>
+{
+  // Logical thread id to thread idx (one-thread)
+  using ThrID = Layout<_1>;
+
+  // Map from (src-thr,src-val) to bit
+  using SrcLayout = Layout<Shape<_1,Int<sizeof_bits<S>::value>>>;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = Layout<Shape<_1,Int<sizeof_bits<D>::value>>>;
+
+  // Reference map from (thr,val) to bit
+  using RefLayout = SrcLayout;
+};
+
+template <class S, class D>
+struct Copy_Traits<SM80_CP_ASYNC_CACHEGLOBAL<S,D>>
+{
+  // Logical thread id to thread idx (one-thread)
+  using ThrID = Layout<_1>;
+
+  // Map from (src-thr,src-val) to bit
+  using SrcLayout = Layout<Shape<_1,Int<sizeof_bits<S>::value>>>;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = Layout<Shape<_1,Int<sizeof_bits<D>::value>>>;
+
+  // Reference map from (thr,val) to bit
+  using RefLayout = SrcLayout;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Element copy selector
+template <class SrcTensor, class DstTensor>
+CUTE_HOST_DEVICE constexpr
+auto
+select_elementwise_copy(SrcTensor const&, DstTensor const&)
+{
+  using SrcType = typename SrcTensor::value_type;
+  using DstType = typename DstTensor::value_type;
+
+#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED)
+  if constexpr (is_gmem<SrcTensor>::value && is_smem<DstTensor>::value &&
+                sizeof(SrcType) == sizeof(DstType) &&
+                (sizeof(SrcType) == 4 || sizeof(SrcType) == 8 || sizeof(SrcType) == 16))
+  {
+    return SM80_CP_ASYNC_CACHEALWAYS<SrcType,DstType>{};
+  } else {
+    return UniversalCopy<SrcType,DstType>{};
+  }

+  CUTE_GCC_UNREACHABLE;
+#else
+  return UniversalCopy<SrcType,DstType>{};
+#endif
+}
+
+}
diff --git a/include/cute/atom/copy_traits_sm90.hpp b/include/cute/atom/copy_traits_sm90.hpp
new file mode 100644
index 0000000000..8c5e843f4e
--- /dev/null
+++ b/include/cute/atom/copy_traits_sm90.hpp
@@ -0,0 +1,132 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/arch/copy_sm90.hpp>
+#include <cute/atom/copy_traits.hpp>
+#include <cute/atom/copy_traits_sm75.hpp>
+
+#include <cute/layout.hpp>
+
+namespace cute
+{
+
+template <>
+struct Copy_Traits<SM90_U32x1_STSM_N>
+{
+  // Logical thread id to thread idx (warp)
+  using ThrID = Layout<_32>;
+
+  // Map from (src-thr,src-val) to bit
+  using SrcLayout = typename Copy_Traits<SM75_U32x1_LDSM_N>::DstLayout;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = typename Copy_Traits<SM75_U32x1_LDSM_N>::SrcLayout;
+
+  // Reference map from (thr,val) to bit
+  using RefLayout = SrcLayout;
+};
+
+template <>
+struct Copy_Traits<SM90_U32x2_STSM_N>
+{
+  // Logical thread id to thread idx (warp)
+  using ThrID = Layout<_32>;
+
+  // Map from (src-thr,src-val) to bit
+  using SrcLayout = typename Copy_Traits<SM75_U32x2_LDSM_N>::DstLayout;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = typename Copy_Traits<SM75_U32x2_LDSM_N>::SrcLayout;
+
+  // Reference map from (thr,val) to bit
+  using RefLayout = SrcLayout;
+};
+
+template <>
+struct Copy_Traits<SM90_U32x4_STSM_N>
+{
+  // Logical thread id to thread idx (warp)
+  using ThrID = Layout<_32>;
+
+  // Map from (src-thr,src-val) to bit
+  using SrcLayout = typename Copy_Traits<SM75_U32x4_LDSM_N>::DstLayout;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = typename Copy_Traits<SM75_U32x4_LDSM_N>::SrcLayout;
+
+  // Reference map from (thr,val) to bit
+  using RefLayout = SrcLayout;
+};
+
+template <>
+struct Copy_Traits<SM90_U16x2_STSM_T>
+{
+  // Logical thread id to thread idx (warp)
+  using ThrID = Layout<_32>;
+
+  // Map from (src-thr,src-val) to bit
+  using SrcLayout = typename Copy_Traits<SM75_U16x2_LDSM_T>::DstLayout;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = typename Copy_Traits<SM75_U16x2_LDSM_T>::SrcLayout;
+
+  // Reference map from (thr,val) to bit
+  using RefLayout = SrcLayout;
+};
+
+template <>
+struct Copy_Traits<SM90_U16x4_STSM_T>
+{
+  // Logical thread id to thread idx (warp)
+  using ThrID = Layout<_32>;
+
+  // Map from (src-thr,src-val) to bit
+  using SrcLayout = typename Copy_Traits<SM75_U16x4_LDSM_T>::DstLayout;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = typename Copy_Traits<SM75_U16x4_LDSM_T>::SrcLayout;
+
+  // Reference map from (thr,val) to bit
+  using RefLayout = SrcLayout;
+};
+
+template <>
+struct Copy_Traits<SM90_U16x8_STSM_T>
+{
+  // Logical thread id to thread idx (warp)
+  using ThrID = Layout<_32>;
+
+  // Map from (src-thr,src-val) to bit
+  using SrcLayout = typename Copy_Traits<SM75_U16x8_LDSM_T>::DstLayout;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = typename Copy_Traits<SM75_U16x8_LDSM_T>::SrcLayout;
+
+  // Reference map from (thr,val) to bit
+  using RefLayout = SrcLayout;
+};
+
+} // end namespace cute
diff --git a/include/cute/atom/copy_traits_sm90_tma.hpp b/include/cute/atom/copy_traits_sm90_tma.hpp
new file mode 100644
index 0000000000..18e22bf604
--- /dev/null
+++ b/include/cute/atom/copy_traits_sm90_tma.hpp
@@ -0,0 +1,795 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include +#include + +#include + +namespace cute +{ + +////////////////////////////////////////////////////////////////////////////// +///////////////////////////// TMA_LOAD /////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////// + +struct SM90_TMA_LOAD_OP : SM90_TMA_LOAD {}; + +// The executable SM90_TMA_LOAD with tma_desc and tma_mbar +template +struct Copy_Traits +{ + using ThrID = Layout<_1>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>; + + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; + + // SM90_TMA_LOAD arguments + TmaDescriptor const& tma_desc_; + uint64_t& tma_load_mbar_; + + template + CUTE_HOST_DEVICE constexpr + void + copy_unpack_(void const* const dst_ptr, + Coord const& src_coord, seq) const + { +#if 0 + print("THR (%d,%d,%d) BLK (%d,%d,%d)\n", + threadIdx.x, threadIdx.y, threadIdx.z, + blockIdx.x, blockIdx.y, blockIdx.z); + print(" TMA Coord "); print(src_coord); print("\n"); + print(" TMA Shape "); print(make_tuple(uint64_t(tma_desc_.size0_), + uint64_t(tma_desc_.size1_), + uint64_t(tma_desc_.size2_), + uint64_t(tma_desc_.size3_))); print("\n"); +#endif + + SM90_TMA_LOAD::copy(&tma_desc_, + tma_load_mbar_, + dst_ptr, + get(src_coord)...); + } + + // This is the copy_unpack dispatch for this Copy_Traits + // Src needs to be a gmem tensor with TmaCoordIterator .data() + // Dst needs to be a smem tensor + template + CUTE_HOST_DEVICE friend constexpr + void + copy_unpack(Copy_Traits const& traits, + Tensor const& src, + Tensor & dst) + { + //static_assert(is_gmem::value, "Expected gmem src for SM90_TMA_LOAD"); // TMA spoofed src tensor + static_assert(is_smem::value, "Expected smem dst for SM90_TMA_LOAD"); + + 
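+    // Note that src here is the TMA coordinate tensor produced by
+    // get_tma_tensor(): its iterator carries a coordinate tuple rather than a
+    // gmem address, which is why src.data().coord_ feeds the instruction below.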
traits.copy_unpack_(dst.data().get(), src.data().coord_, tuple_seq{}); + } +}; + +// The non-executable SM90_TMA_LOAD with tma_desc and no tma_mbar +// Use .with(tma_mbar) to construct an executable version +template +struct Copy_Traits +{ + using ThrID = Layout<_1>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>; + + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; + + // SM90_TMA_LOAD arguments + TmaDescriptor tma_desc_; + GmemStrides g_stride_; + + // Return TmaDescriptor/TensorMap + CUTE_HOST_DEVICE constexpr + TmaDescriptor const* + get_tma_descriptor() const { + return &tma_desc_; + } + + // Construct an executable SM90_TMA_LOAD with tma_mbar + CUTE_HOST_DEVICE constexpr + Copy_Traits + with(uint64_t& tma_mbar, uint16_t const& multicast_mask = 0) const { + // We accept multicast_mask here to keep the API for both atoms consistent + // assert(multicast_mask == 0); + (void) multicast_mask; + return {tma_desc_, tma_mbar}; + } + + // Generate the TMA coord tensor + template + CUTE_HOST_DEVICE constexpr + auto + get_tma_tensor(GShape const& g_shape) const { + static_assert(is_congruent::value); + constexpr int tma_rank = decltype(cute::min(rank(flatten(g_stride_)), Int<5>{}))::value; + return make_tensor(ArithmeticTupleIterator(as_arithmetic_tuple(repeat(Int<0>{}))), + g_shape, + g_stride_); + } + + // Don't try to execute a copy with SM90_TMA_LOAD before calling .with() + template + CUTE_HOST_DEVICE friend constexpr void + copy_unpack(Copy_Traits const& traits, + Tensor const& src, + Tensor & dst) = delete; +}; + +////////////////////////////////////////////////////////////////////////////// +///////////////////////////// TMA_LOAD_MULTICAST ///////////////////////////// +////////////////////////////////////////////////////////////////////////////// + +struct SM90_TMA_LOAD_MULTICAST_OP : SM90_TMA_LOAD_MULTICAST {}; + +template +struct Copy_Traits +{ + using ThrID = Layout<_1>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>; + + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; + + // SM90_TMA_LOAD_MULTICAST arguments + TmaDescriptor const& tma_desc_; + uint64_t& tma_load_mbar_; + uint16_t const& multicast_mask_; + + template + CUTE_HOST_DEVICE constexpr + void + copy_unpack_(void const* const dst_ptr, + Coord const& src_coord, seq) const + { +#if 0 + print("THR (%d,%d,%d) BLK (%d,%d,%d)\n", + threadIdx.x, threadIdx.y, threadIdx.z, + blockIdx.x, blockIdx.y, blockIdx.z); + print(" TMA Coord "); print(src_coord); print("\n"); + print(" TMA Shape "); print(make_tuple(uint64_t(tma_desc_.size0_), + uint64_t(tma_desc_.size1_), + uint64_t(tma_desc_.size2_), + uint64_t(tma_desc_.size3_))); print("\n"); +#endif + + SM90_TMA_LOAD_MULTICAST::copy(&tma_desc_, + tma_load_mbar_, + multicast_mask_, + dst_ptr, + get(src_coord)...); + } + + template + CUTE_HOST_DEVICE friend constexpr + void + copy_unpack(Copy_Traits const& traits, + Tensor const& src, + Tensor & dst) + { + //static_assert(is_gmem::value, "Expected gmem src for SM90_TMA_LOAD"); // TMA spoofed src tensor + static_assert(is_smem::value, "Expected smem dst for SM90_TMA_LOAD_MULTICAST"); + + traits.copy_unpack_(dst.data().get(), src.data().coord_, tuple_seq{}); + } +}; + +template +struct Copy_Traits +{ + using ThrID = Layout<_1>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>; + // Map from 
(dst-thr,dst-val) to bit + using DstLayout = Layout>; + + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; + + // SM90_TMA_LOAD_MULTICAST arguments + TmaDescriptor tma_desc_; + GmemStrides g_stride_; + + // Return TmaDescriptor/TensorMap + CUTE_HOST_DEVICE constexpr + TmaDescriptor const* + get_tma_descriptor() const { + return &tma_desc_; + } + + // Construct an executable SM90_TMA_LOAD_MULTICAST with tma_mbar + CUTE_HOST_DEVICE constexpr + Copy_Traits + with(uint64_t& tma_load_mbar, uint16_t const& multicast_mask) const { + return {tma_desc_, tma_load_mbar, multicast_mask}; + } + + // Generate the TMA coord tensor + template + CUTE_HOST_DEVICE constexpr + auto + get_tma_tensor(GShape const& g_shape) const { + static_assert(is_congruent::value); + constexpr int tma_rank = decltype(cute::min(rank(flatten(g_stride_)), Int<5>{}))::value; + return make_tensor(ArithmeticTupleIterator(as_arithmetic_tuple(repeat(Int<0>{}))), + g_shape, + g_stride_); + } + + // Don't try to execute a copy with SM90_TMA_LOAD_MULTICAST before calling .with() + template + CUTE_HOST_DEVICE friend constexpr void + copy_unpack(Copy_Traits const& traits, + Tensor const& src, + Tensor & dst) = delete; +}; + +////////////////////////////////////////////////////////////////////////////// +///////////////////////////// TMA_STORE ////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////// + +// The executable SM90_TMA_STORE with tma_desc +template +struct Copy_Traits +{ + using ThrID = Layout<_1>; + + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>; + + // Reference map from (thr,val) to bit + using RefLayout = SrcLayout; + + // SM90_TMA_STORE arguments + TmaDescriptor tma_desc_; + GmemStrides g_stride_; + + // Generate the TMA coord tensor + template + CUTE_HOST_DEVICE constexpr + auto + get_tma_tensor(GShape const& g_shape) const { + static_assert(is_congruent::value); + constexpr int tma_rank = decltype(cute::min(rank(flatten(g_stride_)), Int<5>{}))::value; + return make_tensor(ArithmeticTupleIterator(as_arithmetic_tuple(repeat(Int<0>{}))), + g_shape, + g_stride_); + } + + template + CUTE_HOST_DEVICE constexpr + void + copy_unpack_(void const* const src_ptr, + Coord const& dst_coord, seq) const + { +#if 0 + print("THR (%d,%d,%d) BLK (%d,%d,%d)\n", + threadIdx.x, threadIdx.y, threadIdx.z, + blockIdx.x, blockIdx.y, blockIdx.z); + print(" TMA Coord "); print(dst_coord); print("\n"); + print(" TMA Shape "); print(make_tuple(uint64_t(tma_desc_.size0_), + uint64_t(tma_desc_.size1_), + uint64_t(tma_desc_.size2_), + uint64_t(tma_desc_.size3_))); print("\n"); +#endif + + SM90_TMA_STORE::copy(&tma_desc_, + src_ptr, + get(dst_coord)...); + } + + // This is the copy_unpack dispatch for this Copy_Traits + // Src needs to be a smem tensor + // Dst needs to be a gmem tensor with TmaCoordIterator .data() + template + CUTE_HOST_DEVICE friend constexpr + void + copy_unpack(Copy_Traits const& traits, + Tensor const& src, + Tensor & dst) + { + static_assert(is_smem::value, "Expected smem src for SM90_TMA_STORE"); + //static_assert(is_gmem::value, "Expected gmem dst for SM90_TMA_STORE"); // TMA spoofed src tensor + + traits.copy_unpack_(src.data().get(), dst.data().coord_, tuple_seq{}); + } +}; + +// +// MAKE_TMA_COPY and related +// + +template +TMA::SmemSwizzleBits +get_tma_swizzle_bits(ComposedLayout,Offset,SLayout>) +{ + static_assert(M == 4, "Expected 128b=16B=(2^4)B base 
swizzle."); + static_assert(S == 3, "Unsupported layout swizzle"); + + switch (B) { + default: static_assert(0 <= B && B <= 3, "Expected B = 0,1,2, or 3. Unsupported layout swizzle."); + case 3: return TMA::SmemSwizzleBits::B128; + case 2: return TMA::SmemSwizzleBits::B64; + case 1: return TMA::SmemSwizzleBits::B32; + case 0: return TMA::SmemSwizzleBits::DISABLE; + } +} + +template +TMA::SmemSwizzleBits +get_tma_swizzle_bits(Layout) +{ + return TMA::SmemSwizzleBits::DISABLE; +} + +template +auto +get_nonswizzle_layout(ComposedLayout,Offset,SLayout> const& slayout) +{ + return slayout.layout_fn(); +} + +template +auto +get_nonswizzle_layout(Layout const& slayout) +{ + return slayout; +} + +/** Make a CuTe CTA-collective TiledCopy for a TMA operation. + * + * @param CopyOp The target copy operation: SM90_TMA_LOAD, SM90_TMA_LOAD_MULTICAST, SM90_TMA_STORE + * @param gtensor The GMEM Tensor to be involved in the TMA. + * @param slayout The SMEM Layout to be involved in the TMA. + * @param cta_tile The CTA-local tile that each CTA will be tiling GMEM with. + * This is often the blk_shape that is used to tile the GMEM for CTAs: + * local_tile(gtensor, blk_shape, blk_coord) -> CTA-local tile of gtensor + * @param cluster_size When using SM90_TMA_LOAD_MULTICAST, this can be a (static) power-of-2 <= 16 + * defining the multicast size (used to further partition the SMEM) + * Else, static-1 + * + * This code attempts to maximize the TMA box size. It does this by tracing + * the SMEM "vector" -- the inverse of the smem layout -- to find the largest + * contiguous array of smem that can be written to/from global memory given + * the constraints that the TMA instruction imposes. + * + * This is accomplished by assigning "basis" strides to the GMEM to track which + * modes of SMEM map to which modes of GMEM, then reorder the modes of GMEM according + * to the SMEM vector, and then using those GMEM/SMEM modes to fill in the desc. 
+ * + * Examples: + using T = float; + T* gptr = nullptr; + + { + // Simple 2D + Tensor gtensor = make_tensor(gptr, make_shape(1024, 256), GenRowMajor{}); // K-Major GMEM + auto slayout = make_layout(make_shape(_64{}, _32{}), GenRowMajor{}); // K-Major SMEM + auto tma = make_tma_copy(SM90_TMA_LOAD{}, gtensor, slayout); + } + + { + // GMMA 2D + Tensor gtensor = make_tensor(gptr, make_shape(1024, 256)); // MN-Major GMEM + auto slayout = tile_to_shape(GMMA::Layout_MN_SW128_Atom{}, make_shape(_128{},_64{})); // MN-Major Swizzled+Tiled 128x64 SMEM + auto tma = make_tma_copy(SM90_TMA_LOAD{}, gtensor, slayout); + } + + { + // 3D + Tensor gtensor = make_tensor(gptr, make_shape(1024, 32, 512), make_stride(64, Int<1>{}, 65536)); // GMEM + auto slayout = make_layout(make_shape(_16{}, _8{}, _2{}), make_stride(_16{}, _1{}, _8{})); // SMEM w/ same major-mode + auto tma = make_tma_copy(SM90_TMA_LOAD{}, gtensor, slayout); + } + + { + // cuTENSOR 4D + auto layout = make_shape(make_shape(32,40),make_shape(make_shape(8,8),656)); // GMEM + auto cta_tile = make_shape(_128{},make_shape(_32{},_2{})); // GMEM Tiling: + // Take 128-elem from m: m0 must divide 128, + // m-last may be predicated + // Take 32-elem from k0, 2-elem from k1 + auto slayout = make_layout(cta_tile); // Col-Major SMEM + auto tma = make_tma_copy(SM90_TMA_LOAD{}, gtensor, slayout, cta_tile, Int<1>{}); + } + * + * Check the TMA box size and desc: + print("TMA Box size: "); print(typename decltype(tma)::Tiler_MN{}); print("\n"); + print("TMA desc : "); print(tma.tma_desc_); print("\n"); + * + * Usage: + Tensor mA = tma_a.get_tma_tensor(make_shape(M,N)); // (M,N) TMA coord tensor + Tensor gA = local_tile(mA, cta_tile, cta_coord); // (BLK_M,BLK_N) TMA coord tensor for this CTA + Tensor sA = make_tensor(make_smem_ptr(sptr), slayout); // (BLK_M,BLK_N) SMEM tensor + + auto cta_tma = tma.get_slice(cta_idx_in_cluster); // Slice for multicast partitioning + Tensor tAgA = cta_tma.partition_S(gA); // Partition for src + Tensor tAsA = cta_tma.partition_D(sA); // Partition for dst + + copy(tma.with(barrier, mcast_mask), tAgA, tAsA); // copy with supporting TMA params + */ +template +CUTE_HOST +auto +make_tma_copy(CopyOp, + Tensor const& gtensor, + SLayout const& slayout, + CTA_Tile const& cta_tile, + Cluster_Size const& cluster_size) +{ + static_assert((std::is_same::value && is_constant<1, Cluster_Size>::value) || + (std::is_same::value) || + (std::is_same::value && is_constant<1, Cluster_Size>::value)); + + using T = typename Tensor::value_type; + + // + // TMA parameter checking + // + + auto flat_glayout = flatten(gtensor.layout()); + + CUTE_STATIC_ASSERT_V(rank(flatten(cta_tile)) <= Int<5>{}, + "CTA_Tile cannot have more than five modes, TMA arch restriction."); + CUTE_STATIC_ASSERT_V(rank(flat_glayout) <= Int<5>{} || rank(flatten(cta_tile)) <= Int<4>{}, + "If GTensor has more than five modes, then CTA_Tile cannot have more than four modes. 
TMA multimode."); + CUTE_STATIC_ASSERT_V(compatible(product_each(shape(slayout)), shape(cta_tile)), + "CTA_Tile must be compatible with SLayout."); + CUTE_STATIC_ASSERT_V(is_integral{} && has_single_bit(cluster_size) && cluster_size <= Int<16>{}, + "Expecting a pow2 integral Cluster_Size leq 16."); + CUTE_STATIC_ASSERT_V(size(slayout) % cluster_size == Int<0>{}, + "ClusterShape must divide domain size of slayout."); + + // + // TMA slayout manipulation + // + + auto tma_multimode = rank(flat_glayout) > Int<5>{}; + + // Invert the smem to get the largest contiguous vector in the smem layout + auto inv_smem_layout = right_inverse(get_nonswizzle_layout(slayout)); + // trunc_smem_idx -> trunc_smem_coord + + // Map from smem idx to a gmem mode + auto sidx_to_gmode = flatten(composition(make_identity_layout(cta_tile), inv_smem_layout)); + + // Truncate any incompatibilities + auto smem_rank = find_if(stride(sidx_to_gmode), [](auto e){ + [[maybe_unused]] auto v = basis_value(e); + return not is_constant<1,decltype(v)>{}; + }); + static_assert(smem_rank > 0, "Could not find a common smem-gmem vectorization for TMA."); + constexpr int smem_tma_rank = cute::min(int(smem_rank), (tma_multimode ? 4 : 5)); + + // Keep only the static-1 basis modes into gmem + auto sidx_to_gmode_cluster_trunc = take<0,smem_tma_rank>(sidx_to_gmode); + // Keep only the portion each multicast CTA will be responsible for + auto sidx_to_gmode_cta_trunc = composition(sidx_to_gmode_cluster_trunc, shape_div(size(sidx_to_gmode_cluster_trunc), cluster_size)); + + // + // TMA gtensor manipulation + // + + // Generate a TupleBasis for the gtensor + auto flat_gbasis = make_basis_like(shape(flat_glayout)); + + // Fold the flat_gbasis into the glayout + auto glayout_basis = make_layout(shape(gtensor), + stride(composition(make_layout(repeat_like(shape(flat_glayout), Int<2>{}), flat_gbasis), + make_layout(repeat_like(shape(gtensor), Int<2>{}))))); + + // Tile the modes of gtensor with cta_tile + auto cta_glayout_basis = composition(glayout_basis, cta_tile); + + // Check that the cta_tile selects modes from gtensor properly + for_each(flatten(stride(cta_glayout_basis)), [](auto d) { + static_assert(is_constant<1, decltype(d.value())>::value, + "CTA_Tile does not faithfully partition the GMEM, it should select the number of elements from each mode of glayout."); + }); + + // Tile the modes of gtensor again with the truncated cta_tile o inv_smem_layout + auto tma_layout_cta_trunc = flatten(composition(glayout_basis, sidx_to_gmode_cta_trunc)); + + // Append any missing basis on the end as size-1 modes b/c they got truncated + auto missing_basis = fold(stride(tma_layout_cta_trunc), flat_gbasis, [](auto init, auto e){ + auto k = find(init, e); + return remove(init); + }); + + // The appended map from truncated smem codomain to gmem mode: trunc_smem_idx -> gmem_mode + auto tma_layout_cta = flatten(make_layout(tma_layout_cta_trunc, + make_layout(repeat(Int<1>{}), missing_basis))); + +#if 0 + print("g_layout : "); print(gtensor.layout()); print("\n"); + print("s_layout : "); print(slayout); print("\n"); + print("cta_tile : "); print(cta_tile); print("\n"); + print("cluster_size : "); print(cluster_size); print("\n"); + print("flat_gbasis : "); print(flat_gbasis); print("\n"); + print("cta_glayout : "); print(cta_glayout_basis); print("\n"); + print("inv_smem : "); print(inv_smem_layout); print("\n"); + print("sidx_to_gmode : "); print(sidx_to_gmode); print("\n"); + print("missing_b : "); print(missing_basis); print("\n"); + 
print("tma_layout_cta: "); print(tma_layout_cta); print("\n"); +#endif + + // + // TMA gmem desc info + // + + constexpr int TmaRANK = cute::min(rank(flat_glayout), 5); + void* gmem_address = (void*) gtensor.data(); + + cute::array gmem_prob_shape = {1,1,1,1,1}; + cute::array gmem_prob_stride = {0,0,0,0,0}; + for_each(make_seq{}, [&](auto i) { + // NOTE : WAR g++-7.3.5, let it deduce e rather than fuse with below + auto e = stride(tma_layout_cta); + constexpr int j = decltype(e.mode())::value; + constexpr int tma_i = i < 5 ? i : 4; + + // Problem stride + uint64_t stride_j = stride(flat_glayout) * sizeof(T); + uint64_t old_stride = gmem_prob_stride[tma_i]; + gmem_prob_stride[tma_i] = gcd(gmem_prob_stride[tma_i], stride_j); + + // Problem shape + uint64_t shape_j = shape(flat_glayout); + if (gmem_prob_stride[tma_i] != 0) { + // We're "resetting" this TMA mode and using it as a "multimode" + // Recurrence: g_shape = (s_i - 1) * (d_i / gcd_j d_j) + 1 + gmem_prob_shape[tma_i] = (gmem_prob_shape[tma_i]-1) * (old_stride / gmem_prob_stride[tma_i]) + + (shape_j-1) * (stride_j / gmem_prob_stride[tma_i]) + + 1; + } else { + gmem_prob_shape[tma_i] = shape_j; + } + }); + + assert((reinterpret_cast(gmem_address) & 0b1111) == 0); // Address must be 16B-aligned + + assert(gmem_prob_shape[0] >= (uint64_t(1))); // Size must be min 1 + assert(gmem_prob_shape[0] <= (uint64_t(1) << 32)); // Size must be max 2^32 + assert(gmem_prob_shape[1] >= (uint64_t(1))); // Size must be min 1 + assert(gmem_prob_shape[1] <= (uint64_t(1) << 32)); // Size must be max 2^32 + assert(gmem_prob_shape[2] >= (uint64_t(1))); // Size must be min 1 + assert(gmem_prob_shape[2] <= (uint64_t(1) << 32)); // Size must be max 2^32 + assert(gmem_prob_shape[3] >= (uint64_t(1))); // Size must be min 1 + assert(gmem_prob_shape[3] <= (uint64_t(1) << 32)); // Size must be max 2^32 + assert(gmem_prob_shape[4] >= (uint64_t(1))); // Size must be min 1 + assert(gmem_prob_shape[4] <= (uint64_t(1) << 32)); // Size must be max 2^32 + + assert((gmem_prob_stride[0]) == sizeof(T)); // First stride is implicitly 1 + assert((gmem_prob_stride[1]) < (uint64_t(1) << 40)); // Stride must be max 2^40 + assert((gmem_prob_stride[1] & 0b1111) == 0); // Stride must be multiple of 16B (128b) + assert((gmem_prob_stride[2]) < (uint64_t(1) << 40)); // Stride must be max 2^40 + assert((gmem_prob_stride[2] & 0b1111) == 0); // Stride must be multiple of 16B (128b) + assert((gmem_prob_stride[3]) < (uint64_t(1) << 40)); // Stride must be max 2^40 + assert((gmem_prob_stride[3] & 0b1111) == 0); // Stride must be multiple of 16B (128b) + assert((gmem_prob_stride[4]) < (uint64_t(1) << 40)); // Stride must be max 2^40 + assert((gmem_prob_stride[4] & 0b1111) == 0); // Stride must be multiple of 16B (128b) + + // + // TMA smem desc info + // + + // TMA smem box size + cute::array smem_box_shape = {1,1,1,1,1}; + for_each(make_seq{}, [&](auto i) { + uint32_t shape_i = shape(tma_layout_cta); + constexpr int tma_i = i < 5 ? 
i : 4; + if (tma_multimode && tma_i == 4) { + // We're "reusing" this TMA mode and using it as a "multimode" + smem_box_shape[tma_i] = 1; + } else { + smem_box_shape[tma_i] = shape_i; + } + }); + + // TMA smem mode strides + [[maybe_unused]] cute::array smem_box_stride = {1,1,1,1,1}; + + assert(smem_box_shape[0] >= (uint64_t(1))); // Size must be min 1 + assert(smem_box_shape[0] <= (uint64_t(1) << 8)); // Size must be max 2^8 + assert(smem_box_shape[0] >= (uint64_t(1))); // Size must be min 1 + assert(smem_box_shape[0] <= (uint64_t(1) << 8)); // Size must be max 2^8 + assert(smem_box_shape[0] >= (uint64_t(1))); // Size must be min 1 + assert(smem_box_shape[0] <= (uint64_t(1) << 8)); // Size must be max 2^8 + assert(smem_box_shape[0] >= (uint64_t(1))); // Size must be min 1 + assert(smem_box_shape[0] <= (uint64_t(1) << 8)); // Size must be max 2^8 + + assert(smem_box_stride[0] >= (uint32_t(1))); // Stride must be min 1 + assert(smem_box_stride[0] <= (uint32_t(8))); // Stride must be max 2^3 + assert(smem_box_stride[1] >= (uint32_t(1))); // Stride must be min 1 + assert(smem_box_stride[1] <= (uint32_t(8))); // Stride must be max 2^3 + assert(smem_box_stride[2] >= (uint32_t(1))); // Stride must be min 1 + assert(smem_box_stride[2] <= (uint32_t(8))); // Stride must be max 2^3 + assert(smem_box_stride[3] >= (uint32_t(1))); // Stride must be min 1 + assert(smem_box_stride[3] <= (uint32_t(8))); // Stride must be max 2^3 + assert(smem_box_stride[4] >= (uint32_t(1))); // Stride must be min 1 + assert(smem_box_stride[4] <= (uint32_t(8))); // Stride must be max 2^3 + + // + // Construct the descriptor + // + + TmaDescriptor tma_desc = {0}; + +#if (__CUDACC_VER_MAJOR__ >= 12) + + // + // TMA general info + // + + cuuint32_t tma_dim = TmaRANK; + CUtensorMapDataType tma_format = TMA::to_CUtensorMapDataType(); + CUtensorMapInterleave tma_interleave = CU_TENSOR_MAP_INTERLEAVE_NONE; + CUtensorMapL2promotion tma_l2Promotion = CU_TENSOR_MAP_L2_PROMOTION_NONE; + CUtensorMapFloatOOBfill tma_oobFill = CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE; + + // TMA smem swizzle type + CUtensorMapSwizzle smem_swizzle = TMA::to_CUtensorMapSwizzle(get_tma_swizzle_bits(slayout)); + + CUresult result = cuTensorMapEncodeTiled( + &tma_desc, + tma_format, + tma_dim, + gmem_address, + gmem_prob_shape.data(), + gmem_prob_stride.data() + 1, // gmem_prob_stride[0] implicitly 1 + smem_box_shape.data(), + smem_box_stride.data(), + tma_interleave, + smem_swizzle, + tma_l2Promotion, + tma_oobFill); + + if (result != CUDA_SUCCESS) { + std::cerr << "TMA Desc Addr: " << &tma_desc + << "\nformat " << tma_format + << "\ndim " << tma_dim + << "\ngmem_address " << gmem_address + << "\nglobalDim " << gmem_prob_shape + << "\nglobalStrides " << gmem_prob_stride + << "\nboxDim " << smem_box_shape + << "\nelementStrides " << smem_box_stride + << "\ninterleave " << tma_interleave + << "\nswizzle " << smem_swizzle + << "\nl2Promotion " << tma_l2Promotion + << "\noobFill " << tma_oobFill << std::endl; + std::cerr << "Error: Failed to intialize the TMA descriptor " << result << std::endl; + assert(false); + } +#endif // (__CUDACC_VER_MAJOR__ >= 12) + + // + // Construct the Copy_Traits + // + + // Finally, get the inverse permutation of the E bases for the mocked gmem stride + auto gmem_stride_bases_flat = transform(make_seq{}, [&](auto i) { + auto k = find(stride(tma_layout_cta), E{}); + // NOTE: gcc 7.3.5 WAR -- avoid if constexpr + int32_t tma_coord_stride = int32_t(stride(flat_glayout) * sizeof(T) / (gmem_prob_stride[4] != 0 ? 
gmem_prob_stride[4] : 16)); + return conditional_return(tma_multimode && (k >= Int<4>{}), + E<4>{} * tma_coord_stride, // The 4th TMA mode is the multimode, use int32_t coord stride + E{}); + }); + + // Give that the profile of gtensor and fold it + auto gmem_stride_bases = stride(composition(make_layout(repeat_like(shape(flat_glayout), Int<2>{}), gmem_stride_bases_flat), + make_layout(repeat_like(shape(gtensor), Int<2>{})))); + + constexpr int num_bits = size(sidx_to_gmode_cta_trunc) * sizeof(T) * 8; + using Traits = Copy_Traits, decltype(gmem_stride_bases)>; + +#if 0 + print("num_bits : "); print(num_bits); print("\n"); + print("g_stride_bases: "); print(gmem_stride_bases); print("\n"); +#endif + + // + // Construct the TiledCopy + // + + // The ThrVal layout for 1 TMA instruction within cta_tile + auto layout_tv_1 = composition(inv_smem_layout, make_layout(make_shape(cluster_size, size(sidx_to_gmode_cta_trunc)), GenRowMajor{})); + // The ThrVal layout for N TMA instructions within cta_tile + auto layout_tv = tile_to_shape(layout_tv_1, make_shape(cluster_size, size(cta_tile)/cluster_size)); + +#if 0 + print("layout_tv : "); print(layout_tv); print("\n"); +#endif + + return TiledCopy, decltype(layout_tv), decltype(cta_tile)>{tma_desc, gmem_stride_bases}; +} + +// Explicit defaulting +template +CUTE_HOST +auto +make_tma_copy(CopyOp const& copy_op, + Tensor const& gtensor, + SLayout const& slayout) +{ + return make_tma_copy(copy_op, gtensor, slayout, product_each(shape(slayout)), Int<1>{}); +} + +template +CUTE_HOST +auto +make_tma_copy(CopyOp const& copy_op, + Tensor const& gtensor, + SLayout const& slayout, + Cluster_Size const& cluster_size) +{ + return make_tma_copy(copy_op, gtensor, slayout, product_each(shape(slayout)), cluster_size); +} + +} // end namespace cute diff --git a/include/cute/atom/mma_atom.hpp b/include/cute/atom/mma_atom.hpp new file mode 100644 index 0000000000..c3025f5065 --- /dev/null +++ b/include/cute/atom/mma_atom.hpp @@ -0,0 +1,1081 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include + +#include +#include +#include +#include + +namespace cute { + +// Generic mma_unpack for any MMA_Traits +template +CUTE_HOST_DEVICE constexpr +void +mma_unpack(MMA_Traits const&, + Tensor & D, + Tensor const& A, + Tensor const& B, + Tensor const& C) +{ + static_assert(is_rmem::value, "Expected registers in MMA_Atom::call"); + static_assert(is_rmem::value, "Expected registers in MMA_Atom::call"); + static_assert(is_rmem::value, "Expected registers in MMA_Atom::call"); + static_assert(is_rmem::value, "Expected registers in MMA_Atom::call"); + + // Register value types from the MMA_Operation register arrays + using RegTypeD = typename std::remove_extent::type; + using RegTypeA = typename std::remove_extent::type; + using RegTypeB = typename std::remove_extent::type; + using RegTypeC = typename std::remove_extent::type; + constexpr int RegNumD = std::extent::value; + constexpr int RegNumA = std::extent::value; + constexpr int RegNumB = std::extent::value; + constexpr int RegNumC = std::extent::value; + + Tensor rA = recast(A); + Tensor rB = recast(B); + + CUTE_STATIC_ASSERT_V(size(rA) == Int{}); + CUTE_STATIC_ASSERT_V(size(rB) == Int{}); + + if constexpr (std::is_same::value) + { + static_assert(std::is_same::value, "GMMA C and D value_type must match."); + static_assert(std::is_same::value, "GMMA C and D layouts must match."); + // assert((void*)&C == (void*)&D); + + Tensor rC = recast(D); // NOTE: D and C are same, so use mutable D + + //CUTE_STATIC_ASSERT_V(size(rC) == Int{}); + + detail::explode(Operation::fma, + rA, make_int_sequence{}, + rB, make_int_sequence{}, + rC, make_int_sequence{}); + } else + { + Tensor rD = recast(D); + Tensor rC = recast(C); + + CUTE_STATIC_ASSERT_V(size(rD) == Int{}); + CUTE_STATIC_ASSERT_V(size(rC) == Int{}); + + detail::explode(Operation::fma, + rD, make_int_sequence{}, + rA, make_int_sequence{}, + rB, make_int_sequence{}, + rC, make_int_sequence{}); + } +} + + +namespace detail { + +template +struct FrgTypeA_or_Default { using type = typename X::ElementAVal; }; +template +struct FrgTypeA_or_Default> { using type = typename X::ElementAFrg; }; + +template +struct FrgTypeB_or_Default { using type = typename X::ElementBVal; }; +template +struct FrgTypeB_or_Default> { using type = typename X::ElementBFrg; }; + +template +struct FrgTypeC_or_Default { using type = typename X::ElementCVal; }; +template +struct FrgTypeC_or_Default> { using type = typename X::ElementCFrg; }; + +} // end namespace detail + +template +struct MMA_Atom; + +template +struct MMA_Atom : MMA_Atom> +{}; + +template +struct MMA_Atom> + : MMA_Traits +{ + using Traits = MMA_Traits; + + // Element value types from the MMA_Traits + using ValTypeD = typename Traits::ElementDVal; + using ValTypeA = typename Traits::ElementAVal; + using ValTypeB = typename Traits::ElementBVal; + using ValTypeC = typename Traits::ElementCVal; + + 
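+  // For example (illustrative): with MMA_Atom<SM80_16x8x16_F32F16F16F32_TN>,
+  // ValTypeA and ValTypeB are half_t while ValTypeC and ValTypeD are float.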
// Thr-Val layouts from the MMA_Traits + using Shape_MNK = typename Traits::Shape_MNK; + using ThrID = typename Traits::ThrID; + using LayoutC_TV = typename Traits::CLayout; + using LayoutA_TV = typename Traits::ALayout; + using LayoutB_TV = typename Traits::BLayout; + + // Fragment value types from the MMA_Traits (optional, defaults to Val type) + using FrgTypeD = typename detail::FrgTypeC_or_Default::type; + using FrgTypeA = typename detail::FrgTypeA_or_Default::type; + using FrgTypeB = typename detail::FrgTypeB_or_Default::type; + using FrgTypeC = typename detail::FrgTypeC_or_Default::type; + + // Additional Trait parameters/transformations + template + CUTE_HOST_DEVICE + auto + with(TraitsArgs&&... args) const { + auto traits = Traits::with(std::forward(args)...); + return MMA_Atom{traits}; + } + + // Print thread and data layouts for debugging + CUTE_HOST_DEVICE static + void + print_all() + { + print("ThrID: "); print(ThrID{}); print("\n"); + print("LayoutA_TV: "); print(LayoutA_TV{}); print("\n"); + print("LayoutB_TV: "); print(LayoutB_TV{}); print("\n"); + print("LayoutC_TV: "); print(LayoutC_TV{}); print("\n"); + } + + // + // Tensor call interfaces + // + + // Cast, check, and call fma + template + CUTE_HOST_DEVICE constexpr + void + call(Tensor & D, + Tensor const& A, + Tensor const& B, + Tensor const& C) const + { + static_assert(DLayout::rank == 1, "Expected rank-1 D tensor"); + static_assert(ALayout::rank == 1, "Expected rank-1 A tensor"); + static_assert(BLayout::rank == 1, "Expected rank-1 B tensor"); + static_assert(CLayout::rank == 1, "Expected rank-1 C tensor"); + + return mma_unpack(*this, D, A, B, C); + } + + // Three arguments reproduces C + template + CUTE_HOST_DEVICE constexpr + void + call(Tensor const& A, + Tensor const& B, + Tensor & C) const + { + return call(C, A, B, C); + } + + // + // make_fragment_A|B|C + // These functions are awkward as they expect already-partitioned tensors + // resulting from a previous call to partition_A|B|C + // The reasoning is that we can inspect the layout of the partitioned data + // and attempt to match it in generated fragment to promote vectorization + // when copying from partition to fragment. 
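+  // For example (editor's sketch; tensor names are hypothetical):
+  //   auto thr_mma = tiled_mma.get_slice(threadIdx.x);
+  //   Tensor tCsA = thr_mma.partition_A(sA);        // partition the smem tile first
+  //   Tensor tCrA = thr_mma.make_fragment_A(tCsA);  // then mirror its layout in registers
+  //   copy(tCsA, tCrA);                             // layout-matched, vectorizable copy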
+ // + + template + CUTE_HOST_DEVICE static constexpr + auto + make_fragment_C(CTensor&& ctensor) + { + // Check that this tensor is likely already partitioned + CUTE_STATIC_ASSERT_V(rank(ctensor) >= Int<3>{}); // VMN + CUTE_STATIC_ASSERT_V(size<0>(ctensor) == size<1>(LayoutC_TV{})); + + // C is a bit special because we are after accumulators here + // The input/output type doesn't have to match the accumulator type + //static_assert(std::is_same::value_type>::value, "Expecting ValTypeC type"); + + // We'll never base the accumulator layout on the input tensor layout, so just return a FrgTypeC tensor + return make_tensor(shape(ctensor)); + } + + template + CUTE_HOST_DEVICE static constexpr + auto + make_fragment_A(ATensor&& atensor) + { + // Check that this tensor is likely already partitioned + CUTE_STATIC_ASSERT_V(rank(atensor) >= Int<3>{}); // VMK + CUTE_STATIC_ASSERT_V(size<0>(atensor) == size<1>(LayoutA_TV{})); + static_assert(std::is_same::value_type>::value, "Expecting ValTypeA type"); + + if constexpr (has_dereference::value) { + return recast(std::forward(atensor)); + } else { + return make_tensor(make_fragment_like(atensor.layout())); + } + + CUTE_GCC_UNREACHABLE; + } + + template + CUTE_HOST_DEVICE static constexpr + auto + make_fragment_B(BTensor&& btensor) + { + // Check that this tensor is likely already partitioned + CUTE_STATIC_ASSERT_V(rank(btensor) >= Int<3>{}); // VNK + CUTE_STATIC_ASSERT_V(size<0>(btensor) == size<1>(LayoutB_TV{})); + static_assert(std::is_same::value_type>::value, "Expecting ValTypeB type"); + + if constexpr (has_dereference::value) { + return recast(std::forward(btensor)); + } else { + return make_tensor(make_fragment_like(btensor.layout())); + } + + CUTE_GCC_UNREACHABLE; + } +}; + +// +// A tiling of mma atoms +// + +template +struct ThrMMA; + +template >, + class ValLayoutMNK = Layout>, + class PermutationsMNK = Tile> +struct TiledMMA : MMA_Atom +{ + static_assert(rank_v == 3, "TiledMMA requires rank-3 AtomLayoutMNK"); + static_assert(rank_v == 3, "TiledMMA requires rank-3 ValLayoutMNK"); + static_assert(rank_v == 3, "TiledMMA requires rank-3 PermutationsMNK"); + + using AtomShape_MNK = typename MMA_Atom::Shape_MNK; + + using AtomLayoutC_TV = typename MMA_Atom::LayoutC_TV; + using AtomLayoutA_TV = typename MMA_Atom::LayoutA_TV; + using AtomLayoutB_TV = typename MMA_Atom::LayoutB_TV; + + // ThrV -> thread_idx + using AtomThrID = typename MMA_Atom::ThrID; + + // (M,N,K) + using TiledShape_MNK = decltype(make_shape(size<0>(AtomShape_MNK{})*size<0>(AtomLayoutMNK{})*size<0>(ValLayoutMNK{}), + size<1>(AtomShape_MNK{})*size<1>(AtomLayoutMNK{})*size<1>(ValLayoutMNK{}), + size<2>(AtomShape_MNK{})*size<2>(AtomLayoutMNK{})*size<2>(ValLayoutMNK{}))); + + // thrid = (ThrV,ThrM,ThrN,ThrK) -> thr_idx + using ThrLayoutVMNK = decltype(tiled_product(AtomThrID{}, AtomLayoutMNK{})); + + // thr_idx -> (ThrV,ThrM,ThrN,ThrK) + using TidLayout = decltype(right_inverse(ThrLayoutVMNK{})); + + // Tile a tensor or a layout from shape + // (M,N,...) + // to shape + // ((ThrV,(ThrM,ThrN)),(FrgV,(RestM,RestN,...))) + // where + // ThrV: The threads local to an MMA. layout<0>(ThrLayoutVMNK): ThrV -> thread_idx + // ThrM: The threads tiled in M. layout<1>(ThrLayoutVMNK): ThrM -> thread_idx + // ThrN: The threads tiled in N. layout<2>(ThrLayoutVMNK): ThrN -> thread_idx + // FrgV: The values local to an MMA. + // RestM: The values tiled in M. + // RestN: The values tiled in N. 
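+  //
+  // Worked example (editor's sketch): an atom with Shape_MNK = (16,8,8) tiled by
+  // AtomLayoutMNK = (2,2,1) gives TiledShape_MNK = (32,16,8); a (64,32) C tensor then
+  // tiles to ((ThrV,(_2,_2)),(FrgV,(_2,_2))), i.e. each thread's FrgV atom values are
+  // repeated over a 2x2 grid of tile repeats in M and N.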
+ template + CUTE_HOST_DEVICE constexpr static + auto + thrfrg_C(CTensor&& ctensor) + { + CUTE_STATIC_ASSERT_V(rank(ctensor) >= Int<2>{}); + CUTE_STATIC_ASSERT_V(size<0>(ctensor) % size<0>(TiledShape_MNK{}) == Int<0>{}); + CUTE_STATIC_ASSERT_V(size<1>(ctensor) % size<1>(TiledShape_MNK{}) == Int<0>{}); + + // Reorder the tensor for the TiledAtom + auto t_tile = make_tile(left_inverse(get<0>(PermutationsMNK{})), + left_inverse(get<1>(PermutationsMNK{}))); + auto t_tensor = logical_divide(ctensor, t_tile); // (PermM,PermN) + + // Tile the tensor for the Atom + auto a_tile = make_tile(make_layout(size<0>(AtomShape_MNK{})), + make_layout(size<1>(AtomShape_MNK{}))); + auto a_tensor = zipped_divide(t_tensor, a_tile); // ((AtomM,AtomN),(RestM,RestN)) + + // Transform the Atom mode from (M,K) to (Thr,Val) + auto tv_tensor = a_tensor.compose(AtomLayoutC_TV{},_); // ((ThrV,FrgV),(RestM,RestN)) + + // Tile the tensor for the C-threads + auto thr_tile = make_tile(_, + make_tile(make_layout(size<1>(ThrLayoutVMNK{})), + make_layout(size<2>(ThrLayoutVMNK{})))); + auto thr_tensor = zipped_divide(tv_tensor, thr_tile); // ((ThrV,(ThrM,ThrN)),(FrgV,(RestM,RestN))) + + return thr_tensor; + } + + // Tile from (M,N,...) + // to (thr_idx,(FrgV,(RestM,RestN,...))) + template + CUTE_HOST_DEVICE constexpr static + auto + tidfrg_C(CTensor&& ctensor) + { + // Don't need a ctile composition because ThrK is last mode in TidLayout + + return thrfrg_C(ctensor).compose(TidLayout{}, _); + } + + // Tile a tensor or a layout from shape + // (M,K,...) + // to shape + // ((ThrV,(ThrM,ThrK)),(FrgV,(RestM,RestK,...))) + // where + // ThrV: The threads local to an MMA. layout<0>(ThrLayoutVMNK): ThrV -> thread_idx + // ThrM: The threads tiled in M. layout<1>(ThrLayoutVMNK): ThrM -> thread_idx + // ThrK: The threads tiled in K. layout<3>(ThrLayoutVMNK): ThrK -> thread_idx + // FrgV: The values local to an MMA. + // RestM: The values tiled in M. + // RestK: The values tiled in K. + template + CUTE_HOST_DEVICE constexpr static + auto + thrfrg_A(ATensor&& atensor) + { + CUTE_STATIC_ASSERT_V(rank(atensor) >= Int<2>{}); + CUTE_STATIC_ASSERT_V(size<0>(atensor) % size<0>(TiledShape_MNK{}) == Int<0>{}); + CUTE_STATIC_ASSERT_V(size<1>(atensor) % size<2>(TiledShape_MNK{}) == Int<0>{}); + + // Reorder the tensor for the TiledAtom + auto t_tile = make_tile(left_inverse(get<0>(PermutationsMNK{})), + left_inverse(get<2>(PermutationsMNK{}))); + auto t_tensor = logical_divide(atensor, t_tile); // (PermM,PermK) + + // Tile the tensor for the Atom + auto a_tile = make_tile(make_layout(size<0>(AtomShape_MNK{})), + make_layout(size<2>(AtomShape_MNK{}))); + auto a_tensor = zipped_divide(t_tensor, a_tile); // ((AtomM,AtomK),(RestM,RestK)) + + // Transform the Atom mode from (M,K) to (Thr,Val) + auto tv_tensor = a_tensor.compose(AtomLayoutA_TV{},_); // ((ThrV,FrgV),(RestM,RestK)) + + // Tile the tensor for the Thread + auto thr_tile = make_tile(_, + make_tile(make_layout(size<1>(ThrLayoutVMNK{})), + make_layout(size<3>(ThrLayoutVMNK{})))); + auto thr_tensor = zipped_divide(tv_tensor, thr_tile); // ((ThrV,(ThrM,ThrK)),(FrgV,(RestM,RestK))) + + return thr_tensor; + } + + // Tile from (M,K,...) 
+ // to (thr_idx,(FrgV,(RestM,RestK,...))) + template + CUTE_HOST_DEVICE constexpr static + auto + tidfrg_A(ATensor&& atensor) + { + auto atile = make_tile(_, + make_tile(make_layout(make_shape (size<1>(ThrLayoutVMNK{}), size<2>(ThrLayoutVMNK{})), + make_stride( Int<1>{} , Int<0>{} )), + _)); + // (ThrV,(ThrM,ThrK)) -> (ThrV,(ThrM,ThrN,ThrK)) + + return thrfrg_A(atensor).compose(atile, _).compose(TidLayout{}, _); + } + + // Tile a tensor or a layout from shape + // (N,K,...) + // to shape + // ((ThrV,(ThrN,ThrK)),(FrgV,(RestN,RestK,...))) + // where + // ThrV: The threads local to an MMA. layout<0>(ThrLayoutVMNK): ThrV -> thread_idx + // ThrN: The threads tiled in N. layout<2>(ThrLayoutVMNK): ThrN -> thread_idx + // ThrK: The threads tiled in K. layout<3>(ThrLayoutVMNK): ThrK -> thread_idx + // FrgV: The values local to an MMA. + // RestN: The values tiled in N. + // RestK: The values tiled in K. + template + CUTE_HOST_DEVICE constexpr static + auto + thrfrg_B(BTensor&& btensor) + { + CUTE_STATIC_ASSERT_V(rank(btensor) >= Int<2>{}); + CUTE_STATIC_ASSERT_V(size<0>(btensor) % size<1>(TiledShape_MNK{}) == Int<0>{}); + CUTE_STATIC_ASSERT_V(size<1>(btensor) % size<2>(TiledShape_MNK{}) == Int<0>{}); + + // Reorder the tensor for the TiledAtom + auto t_tile = make_tile(left_inverse(get<1>(PermutationsMNK{})), + left_inverse(get<2>(PermutationsMNK{}))); + auto t_tensor = logical_divide(btensor, t_tile); // (PermN,PermK) + + // Tile the tensor for the Atom + auto a_tile = make_tile(make_layout(size<1>(AtomShape_MNK{})), + make_layout(size<2>(AtomShape_MNK{}))); + auto a_tensor = zipped_divide(t_tensor, a_tile); // ((AtomN,AtomK),(RestN,RestK)) + + // Transform the Atom mode from (M,K) to (Thr,Val) + auto tv_tensor = a_tensor.compose(AtomLayoutB_TV{},_); // ((ThrV,FrgV),(RestN,RestK)) + + // Tile the tensor for the Thread + auto thr_tile = make_tile(_, + make_tile(make_layout(size<2>(ThrLayoutVMNK{})), + make_layout(size<3>(ThrLayoutVMNK{})))); + auto thr_tensor = zipped_divide(tv_tensor, thr_tile); // ((ThrV,(ThrN,ThrK)),(FrgV,(RestN,RestK))) + + return thr_tensor; + } + + // Tile from (N,K,...) 
+ // to (thr_idx,(FrgV,(RestN,RestK,...))) + template + CUTE_HOST_DEVICE constexpr static + auto + tidfrg_B(BTensor&& btensor) + { + auto btile = make_tile(_, + make_tile(make_layout(make_shape (size<1>(ThrLayoutVMNK{}), size<2>(ThrLayoutVMNK{})), + make_stride( Int<0>{} , Int<1>{} )), + _)); + // (ThrV,(ThrN,ThrK)) -> (ThrV,(ThrM,ThrN,ThrK)) + + return thrfrg_B(btensor).compose(btile, _).compose(TidLayout{}, _); + } + + template ::value)> + CUTE_HOST_DEVICE static constexpr + auto + get_slice(ThrIdx const& thr_idx) + { + auto thr_vmnk = ThrLayoutVMNK{}.get_flat_coord(thr_idx); + return ThrMMA(thr_vmnk); + } + + template ::value)> + CUTE_HOST_DEVICE static constexpr + auto + get_thread_slice(ThrIdx const& thr_idx) + { + return get_slice(thr_idx); + } + + // + // Utility for printing and visualization + // + + CUTE_HOST_DEVICE constexpr static + auto + get_layoutC_MN() + { + // (M,N) -> (M,N) + auto ref_C = make_layout(make_shape(size<0>(TiledShape_MNK{}), size<1>(TiledShape_MNK{}))); + // (cthrid,val) -> (M,N) + auto layoutC_TV = thrfrg_C(ref_C); + // (M,N) -> (cthrid,frg) + auto layoutC_MN = right_inverse(layoutC_TV).with_shape(shape(ref_C)); + + // cthrid = (v,m,n) -> thr_idx + auto thrID_C = ThrLayoutVMNK{}(_,_,_,Int<0>{}); + + return cute::make_tuple(layoutC_MN, thrID_C); + } + + CUTE_HOST_DEVICE constexpr static + auto + get_layoutC_TV() + { + // (M,N) -> (M,N) + auto ref_C = make_layout(make_shape(size<0>(TiledShape_MNK{}), size<1>(TiledShape_MNK{}))); + + return tidfrg_C(ref_C); + } + + CUTE_HOST_DEVICE constexpr static + auto + get_layoutA_MK() + { + // (M,K) -> (M,K) + auto ref_A = make_layout(make_shape(size<0>(TiledShape_MNK{}), size<2>(TiledShape_MNK{}))); + // (athrid,val) -> (M,K) + auto layoutA_TV = thrfrg_A(ref_A); + // (M,K) -> (athrid,frg) + auto layoutA_MK = right_inverse(layoutA_TV).with_shape(shape(ref_A)); + + // athrid = (v,m,k) -> thr_idx + auto thrID_A = ThrLayoutVMNK{}(_,_,Int<0>{},_); + + return cute::make_tuple(layoutA_MK, thrID_A); + } + + CUTE_HOST_DEVICE constexpr static + auto + get_layoutA_TV() + { + // (M,K) -> (M,K) + auto ref_A = make_layout(make_shape(size<0>(TiledShape_MNK{}), size<2>(TiledShape_MNK{}))); + + return tidfrg_A(ref_A); + } + + CUTE_HOST_DEVICE constexpr static + auto + get_layoutB_NK() + { + // (N,K) -> (N,K) + auto ref_B = make_layout(make_shape(size<1>(TiledShape_MNK{}), size<2>(TiledShape_MNK{}))); + // (bthrid,val) -> (N,K) + auto layoutB_TV = thrfrg_B(ref_B); + // (N,K) -> (bthrid,frg) + auto layoutB_NK = right_inverse(layoutB_TV).with_shape(shape(ref_B)); + + // bthrid = (v,n,k) -> thr_idx + auto thrID_B = ThrLayoutVMNK{}(_,Int<0>{},_,_); + + return cute::make_tuple(layoutB_NK, thrID_B); + } + + CUTE_HOST_DEVICE constexpr static + auto + get_layoutB_TV() + { + // (N,K) -> (N,K) + auto ref_B = make_layout(make_shape(size<1>(TiledShape_MNK{}), size<2>(TiledShape_MNK{}))); + + return tidfrg_B(ref_B); + } +}; + +template +struct ThrMMA : TiledMMA +{ + // Use ThrVMNK and thrfrg rather than thr_idx and tidfrg + // to support swizzled threads partitioning dynamic layouts + ThrVMNK thr_vmnk_; + + CUTE_HOST_DEVICE constexpr + ThrMMA(ThrVMNK const& thr_vmnk) : thr_vmnk_(thr_vmnk) {} + + template + CUTE_HOST_DEVICE constexpr + auto + partition_C(CTensor&& ctensor) const + { + auto thr_tensor = make_tensor(std::forward(ctensor).data(), thrfrg_C(ctensor.layout())); + + auto thr_vmn = make_coord(get<0>(thr_vmnk_), make_coord(get<1>(thr_vmnk_), get<2>(thr_vmnk_))); + return thr_tensor(thr_vmn, make_coord(_, repeat(thr_tensor)>(_))); + } + + 
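+  // [Editor's note] Usage sketch (names hypothetical; assumes a 128-thread TiledMMA):
+  //   ThrMMA thr_mma = tiled_mma.get_slice(threadIdx.x);
+  //   Tensor tCgC = thr_mma.partition_C(gC);           // (FrgV,(RestM,RestN)) view of gC
+  //   Tensor tCrC = thr_mma.partition_fragment_C(gC);  // matching register fragment
+  //   gemm(tiled_mma, tCrA, tCrB, tCrC);               // accumulate into registers
+  //   copy(tCrC, tCgC);                                // write back this thread's values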
template + CUTE_HOST_DEVICE constexpr + auto + partition_A(ATensor&& atensor) const + { + auto thr_tensor = make_tensor(std::forward(atensor).data(), thrfrg_A(atensor.layout())); + + auto thr_vmk = make_coord(get<0>(thr_vmnk_), make_coord(get<1>(thr_vmnk_), get<3>(thr_vmnk_))); + return thr_tensor(thr_vmk, make_coord(_, repeat(thr_tensor)>(_))); + } + + template + CUTE_HOST_DEVICE constexpr + auto + partition_B(BTensor&& btensor) const + { + auto thr_tensor = make_tensor(std::forward(btensor).data(), thrfrg_B(btensor.layout())); + + auto thr_vnk = make_coord(get<0>(thr_vmnk_), make_coord(get<2>(thr_vmnk_), get<3>(thr_vmnk_))); + return thr_tensor(thr_vnk, make_coord(_, repeat(thr_tensor)>(_))); + } + + template + CUTE_HOST_DEVICE constexpr + auto + partition_fragment_C(CTensor&& ctensor) const + { + return make_fragment_C(partition_C(ctensor)); + } + + template + CUTE_HOST_DEVICE constexpr + auto + partition_fragment_A(ATensor&& atensor) const + { + return make_fragment_A(partition_A(atensor)); + } + + template + CUTE_HOST_DEVICE constexpr + auto + partition_fragment_B(BTensor&& btensor) const + { + return make_fragment_B(partition_B(btensor)); + } +}; + +// +// These tile the MMA_Atom as a whole +// + +template >, + class MMAValLayout = Layout>, + class Permutations = Tile> +CUTE_HOST_DEVICE constexpr +auto +make_tiled_mma(MMA_Atom const&, + MMAThrLayout const& thr_layout = {}, + MMAValLayout const& val_layout = {}, + Permutations const& permutations = {}) +{ + auto thr_layout_mnk = append<3>(thr_layout, Layout<_1>{}); + auto val_layout_mnk = append<3>(val_layout, Layout<_1>{}); + auto permutation_mnk = append<3>(permutations, _); + + return TiledMMA, + decltype(thr_layout_mnk), + decltype(val_layout_mnk), + decltype(permutation_mnk)>{}; +} + +template >, + class MMAValLayout = Layout>, + class Permutations = Tile> +CUTE_HOST_DEVICE constexpr +auto +make_tiled_mma(MMA_Op const&, + MMAThrLayout const& thr_layout = {}, + MMAValLayout const& val_layout = {}, + Permutations const& permutations = {}) +{ + // Attempt to wrap in an MMA_Atom<> and forward + return make_tiled_mma(MMA_Atom{}, thr_layout, val_layout, permutations); +} + +// +// partition_fragment_C -- static context +// + +template +CUTE_HOST_DEVICE constexpr +auto +partition_fragment_C(TiledMMA, Shape_MN shapeMN) +{ + constexpr int R = rank_v; + static_assert(R >= 2, "Must have at least rank-2"); + auto atomMNK = typename TiledMMA::AtomShape_MNK{}; + auto thrVMNK = typename TiledMMA::ThrLayoutVMNK{}; + + auto V = size<1>(typename TiledMMA::AtomLayoutC_TV{}); + auto M = shape_div(size<0>(shapeMN), size<0>(atomMNK) * size<1>(thrVMNK)); + auto N = shape_div(size<1>(shapeMN), size<1>(atomMNK) * size<2>(thrVMNK)); + auto frg_shape = tuple_cat(make_shape(V,M,N), take<2,R>(shapeMN)); + + return make_tensor::FrgTypeC>(frg_shape); +} + +// partition_fragment_A and partition_fragment_B often depend on the +// layout of A and B and/or the thread_idx that is requesting the partition. +// For these reasons, they should not be used in a static context. +// See TiledMMA::get_slice(thr_idx).partition_fragment_A(tensorA) instead. 
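+// [Editor's illustration] A sketch (not part of the patch) of partition_fragment_C in a
+// static context. Assumes cute/arch/mma_sm80.hpp and cute/atom/mma_traits_sm80.hpp are
+// in scope; the names below are illustrative only.
+//
+//   // 2x2x1 tiling of the 16x8x8 f16 MMA atom: TiledShape_MNK = (32,16,8), 128 threads
+//   using ExampleTiledMMA = decltype(make_tiled_mma(SM80_16x8x8_F16F16F16F16_TN{},
+//                                                   Layout<Shape<_2,_2,_1>>{}));
+//
+//   // Per-thread accumulators for a (64,64) C tile: (V,M,N) = (4,2,4), 32 values/thread
+//   auto acc = partition_fragment_C(ExampleTiledMMA{}, Shape<_64,_64>{});
+//   CUTE_STATIC_ASSERT_V(size(acc) == Int<32>{});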
+ +// +// Size +// + +template +CUTE_HOST_DEVICE constexpr +auto +tile_size(TiledMMA const& mma) +{ + return size(typename TiledMMA::TiledShape_MNK{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +size(TiledMMA const& mma) +{ + return size(typename TiledMMA::ThrLayoutVMNK{}); +} + +// +// Display utilities +// + +template +CUTE_HOST_DEVICE +auto +print_latex(TiledMMA const& mma) +{ + auto layout_and_thrid_C = mma.get_layoutC_MN(); + auto layoutC_MN = get<0>(layout_and_thrid_C); + auto thrID_C = get<1>(layout_and_thrid_C); + + auto layout_and_thrid_A = mma.get_layoutA_MK(); + auto layoutA_MK = get<0>(layout_and_thrid_A); + auto thrID_A = get<1>(layout_and_thrid_A); + + auto layout_and_thrid_B = mma.get_layoutB_NK(); + auto layoutB_NK = get<0>(layout_and_thrid_B); + auto thrID_B = get<1>(layout_and_thrid_B); + + print_latex_mma(layoutC_MN, thrID_C, + layoutA_MK, thrID_A, + layoutB_NK, thrID_B); +} + +// EXPERIMENTAL -- Doesn't work with Swizzled Thr TileMMAs... +template +CUTE_HOST_DEVICE +auto +print_latex_2(TiledMMA const& mma) +{ + print_latex_mma(typename TiledMMA::TiledShape_MNK{}, + mma.get_layoutC_TV(), + mma.get_layoutA_TV(), + mma.get_layoutB_TV()); +} + +// MNK MMA Layout to console printer -- 8-value color coded by thread +template +CUTE_HOST_DEVICE +void +print_layout_mma(LayoutC const& C, ThrIDC const& TC, // (m,n) -> (tid,vid) and tid -> thr_idx + LayoutA const& A, ThrIDA const& TA, // (m,k) -> (tid,vid) and tid -> thr_idx + LayoutB const& B, ThrIDB const& TB) // (n,k) -> (tid,vid) and tid -> thr_idx +{ + CUTE_STATIC_ASSERT_V(rank(C) == Int<2>{}); + CUTE_STATIC_ASSERT_V(rank(A) == Int<2>{}); + CUTE_STATIC_ASSERT_V(rank(B) == Int<2>{}); + + assert(size<0>(A) == size<0>(C)); + assert(size<0>(B) == size<1>(C)); + assert(size<1>(A) == size<1>(B)); + + int a_width = size<1>(A) * 6 + 4; + + // Print out B (white-shifted) k-by-n + for (int k = 0; k < size<1>(B); ++k) { + // Header + printf("%*s", a_width, ""); + for (int n = 0; n < size<0>(B); ++n) printf("+-----"); + printf("+\n"); + // Values + printf("%*s", a_width, ""); + for (int n = 0; n < size<0>(B); ++n) printf("|T%02dV%1d", int(TB(B(n,k) % size(TB))), int(B(n,k) / size(TB))); + printf("|\n"); + } + // Footer + printf("%*s", a_width, ""); + for (int n = 0; n < size<0>(B); ++n) printf("+-----"); + printf("+\n\n"); + + // Print out A m-by-k and C m-by-n + for (int m = 0; m < size<0>(A); ++m) { + // Header + for (int k = 0; k < size<1>(A); ++k) printf("+-----"); + printf("+ "); + for (int n = 0; n < size<1>(C); ++n) printf("+-----"); + printf("+\n"); + // Values + for (int k = 0; k < size<1>(A); ++k) printf("|T%02dV%1d", int(TA(A(m,k) % size(TA))), int(A(m,k) / size(TA))); + printf("| "); + for (int n = 0; n < size<1>(C); ++n) printf("|T%02dV%1d", int(TC(C(m,n) % size(TC))), int(C(m,n) / size(TC))); + printf("|\n"); + } + // Footer + for (int k = 0; k < size<1>(A); ++k) printf("+-----"); + printf("+ "); + for (int n = 0; n < size<1>(C); ++n) printf("+-----"); + printf("+\n"); +} + +// MNK MMA Layout to Latex TIKZ -- 8-value color coded by thread +template +CUTE_HOST_DEVICE +void +print_latex_mma(LayoutC const& C, ThrIDC const& TC, // (m,n) -> (tid,vid) and tid -> thr_idx + LayoutA const& A, ThrIDA const& TA, // (m,k) -> (tid,vid) and tid -> thr_idx + LayoutB const& B, ThrIDB const& TB) // (n,k) -> (tid,vid) and tid -> thr_idx +{ + CUTE_STATIC_ASSERT_V(rank(C) == Int<2>{}); + CUTE_STATIC_ASSERT_V(rank(A) == Int<2>{}); + CUTE_STATIC_ASSERT_V(rank(B) == Int<2>{}); + + assert(size<0>(A) == size<0>(C)); + assert(size<0>(B) == 
size<1>(C)); + assert(size<1>(A) == size<1>(B)); + + char const* latex_header = + "\\documentclass{standalone}\n" + "\\usepackage{tikz}\n" + "\\usetikzlibrary{external}\n" + "\\tikzexternalize\n" + "\\begin{document}\n" + "\\begin{tikzpicture}[x={(0cm,-1cm)},y={(1cm,0cm)},box/.style={rectangle,draw=black,thick,minimum size=1cm,anchor=center}]\n\n"; + char const* latex_footer = + "\\end{tikzpicture}\n" + "\\end{document}\n"; + + char const* color_map[8] = {"{rgb,255:red,175;green,175;blue,255}", + "{rgb,255:red,175;green,255;blue,175}", + "{rgb,255:red,255;green,255;blue,175}", + "{rgb,255:red,255;green,175;blue,175}", + "{rgb,255:red,210;green,210;blue,255}", + "{rgb,255:red,210;green,255;blue,210}", + "{rgb,255:red,255;green,255;blue,210}", + "{rgb,255:red,255;green,210;blue,210}"}; + + // Header + printf("%% LayoutC: "); print(C); printf("\n"); + printf("%% ThrIDC : "); print(TC); printf("\n"); + printf("%% LayoutA: "); print(A); printf("\n"); + printf("%% ThrIDA : "); print(TA); printf("\n"); + printf("%% LayoutB: "); print(B); printf("\n"); + printf("%% ThrIDB : "); print(TB); printf("\n\n"); + + printf(latex_header); + + // C starting at 0,0 + for (int m = 0; m < size<0>(C); ++m) { + for (int n = 0; n < size<1>(C); ++n) { + int thrid = C(m,n) % size(TC); + int val_idx = C(m,n) / size(TC); + int thr_idx = TC(thrid); + + printf("\\node[box,fill=%s] at (%d,%d) {\\shortstack{T%d \\\\ V%d}};\n", + color_map[thr_idx % 8], + m, n, + thr_idx, val_idx); + } + } + + // A starting at 0,-size<1>(A)-1 + for (int m = 0; m < size<0>(A); ++m) { + for (int k = 0; k < size<1>(A); ++k) { + int thrid = A(m,k) % size(TA); + int val_idx = A(m,k) / size(TA); + int thr_idx = TA(thrid); + + printf("\\node[box,fill=%s] at (%d,%d) {\\shortstack{T%d \\\\ V%d}};\n", + color_map[thr_idx % 8], + m, k-1-size<1>(A), + thr_idx, val_idx); + } + } + + // B starting at -size<1>(B)-1,0 + for (int n = 0; n < size<0>(B); ++n) { + for (int k = 0; k < size<1>(B); ++k) { + int thrid = B(n,k) % size(TB); + int val_idx = B(n,k) / size(TB); + int thr_idx = TB(thrid); + + printf("\\node[box,fill=%s] at (%d,%d) {\\shortstack{T%d \\\\ V%d}};\n", + color_map[thr_idx % 8], + k-1-size<1>(B), n, + thr_idx, val_idx); + } + } + + // A labels + for (int m = 0, k = -1; m < size<0>(A); ++m) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", m, k-1-size<1>(A), m); + } + for (int k = 0, m = -1; k < size<1>(A); ++k) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", m, k-1-size<1>(A), k); + } + // B labels + for (int n = 0, k = -1; n < size<0>(B); ++n) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", k-1-size<1>(B), n, n); + } + for (int k = 0, n = -1; k < size<1>(B); ++k) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", k-1-size<1>(B), n, k); + } + + // Footer + printf(latex_footer); +} + +// ThrVal MMA Layout to Latex TIKZ -- 8-value color coded by thread +template +CUTE_HOST_DEVICE +void +print_latex_mma(Shape_MNK const& shape_mnk, + LayoutC const& C, // (thr_idx,vid) -> (m,n) + LayoutA const& A, // (thr_idx,vid) -> (m,k) + LayoutB const& B) // (thr_idx,vid) -> (n,k) +{ + CUTE_STATIC_ASSERT_V(rank(C) == Int<2>{}); + CUTE_STATIC_ASSERT_V(rank(A) == Int<2>{}); + CUTE_STATIC_ASSERT_V(rank(B) == Int<2>{}); + + char const* latex_header = + "\\documentclass{standalone}\n" + "\\usepackage{tikz}\n" + "\\usetikzlibrary{external}\n" + "\\tikzexternalize\n" + "\\begin{document}\n" + "\\begin{tikzpicture}[x={(0cm,-1cm)},y={(1cm,0cm)},box/.style={rectangle,draw=black,thick,minimum size=1cm,anchor=center}]\n\n"; + char 
const* latex_footer = + "\\end{tikzpicture}\n" + "\\end{document}\n"; + + char const* color_map[8] = {"{rgb,255:red,175;green,175;blue,255}", + "{rgb,255:red,175;green,255;blue,175}", + "{rgb,255:red,255;green,255;blue,175}", + "{rgb,255:red,255;green,175;blue,175}", + "{rgb,255:red,210;green,210;blue,255}", + "{rgb,255:red,210;green,255;blue,210}", + "{rgb,255:red,255;green,255;blue,210}", + "{rgb,255:red,255;green,210;blue,210}"}; + + // Header + printf("%% Shape_MNK: "); print(shape_mnk); printf("\n"); + printf("%% LayoutC : "); print(C); printf("\n"); + printf("%% LayoutA : "); print(A); printf("\n"); + printf("%% LayoutB : "); print(B); printf("\n\n"); + + printf(latex_header); + + int M = size<0>(shape_mnk); + int N = size<1>(shape_mnk); + int K = size<2>(shape_mnk); + + // C starting at 0,0 + bool c_filled[M][N] = {}; + for (int t = 0; t < size<0>(C); ++t) { + for (int v = 0; v < size<1>(C); ++v) { + int m = C(t,v) % M; + int n = C(t,v) / M; + + if (not c_filled[m][n]) { + printf("\\node[box,fill=%s] at (%d,%d) {\\shortstack{T%d \\\\ V%d}};\n", + color_map[t % 8], + m, n, + t, v); + c_filled[m][n] = true; + } + } + } + + // A starting at 0,-size<1>(A)-1 + bool a_filled[M][K] = {}; + for (int t = 0; t < size<0>(A); ++t) { + for (int v = 0; v < size<1>(A); ++v) { + int m = A(t,v) % M; + int k = A(t,v) / M; + + if (not a_filled[m][k]) { + printf("\\node[box,fill=%s] at (%d,%d) {\\shortstack{T%d \\\\ V%d}};\n", + color_map[t % 8], + m, k - 1 - K, + t, v); + a_filled[m][k] = true; + } + } + } + + // B starting at -size<1>(B)-1,0 + bool b_filled[N][K] = {}; + for (int t = 0; t < size<0>(B); ++t) { + for (int v = 0; v < size<1>(B); ++v) { + int n = B(t,v) % N; + int k = B(t,v) / N; + + if (not b_filled[n][k]) { + printf("\\node[box,fill=%s] at (%d,%d) {\\shortstack{T%d \\\\ V%d}};\n", + color_map[t % 8], + k - 1 - K, n, + t, v); + b_filled[n][k] = true; + } + } + } + + // A labels + for (int m = 0, k = -1; m < M; ++m) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", m, k - 1 - K, m); + } + for (int k = 0, m = -1; k < K; ++k) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", m, k - 1 - K, k); + } + // B labels + for (int n = 0, k = -1; n < N; ++n) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", k - 1 - K, n, n); + } + for (int k = 0, n = -1; k < K; ++k) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", k - 1 - K, n, k); + } + + // Footer + printf(latex_footer); +} + +} // namespace cute + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include +#include +#include +#include + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cute/atom/mma_traits.hpp b/include/cute/atom/mma_traits.hpp new file mode 100644 index 0000000000..a8c3323a36 --- /dev/null +++ b/include/cute/atom/mma_traits.hpp @@ -0,0 +1,70 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/arch/mma.hpp>
+
+#include <cute/layout.hpp>
+
+namespace cute
+{
+
+template <class MMAOperation, class... MMAOpArgs>
+struct MMA_Traits
+{
+  static_assert(sizeof(MMAOperation) == 0, "MMA_Traits not implemented for this MMA_Operation.");
+};
+
+template <class D, class A, class B, class C>
+struct MMA_Traits<UniversalFMA<D,A,B,C>>
+{
+  using ElementDVal = D;
+  using ElementAVal = A;
+  using ElementBVal = B;
+  using ElementCVal = C;
+
+  // Logical shape of the MMA
+  using Shape_MNK = Shape<_1,_1,_1>;
+
+  // Logical thread id (tid) -> tidx
+  using ThrID = Layout<_1>;
+
+  // (Logical thread id (tid), Logical value id (vid)) -> coord
+
+  // (tid,vid) -> (m,k)
+  using ALayout = Layout<Shape<_1,_1>>;
+  // (tid,vid) -> (n,k)
+  using BLayout = Layout<Shape<_1,_1>>;
+  // (tid,vid) -> (m,n)
+  using CLayout = Layout<Shape<_1,_1>>;
+};
+
+} // namespace cute
diff --git a/include/cute/atom/mma_traits_sm61.hpp b/include/cute/atom/mma_traits_sm61.hpp
new file mode 100644
index 0000000000..85d4e98787
--- /dev/null
+++ b/include/cute/atom/mma_traits_sm61.hpp
@@ -0,0 +1,73 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/arch/mma_sm61.hpp>
+
+#include <cute/atom/mma_traits.hpp>
+#include <cute/layout.hpp>
+
+namespace cute
+{
+
+template <>
+struct MMA_Traits<SM61_DP4A>
+{
+  using ElementDVal = int32_t;
+  using ElementAVal = int8_t;
+  using ElementBVal = int8_t;
+  using ElementCVal = int32_t;
+
+  using Shape_MNK = Shape<_1,_1,_4>;
+  using ThrID   = Layout<_1>;
+  using ALayout = Layout<Shape<_1,_4>>;
+  using BLayout = Layout<Shape<_1,_4>>;
+  using CLayout = Layout<Shape<_1,_1>>;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <>
+struct MMA_Traits<SM61_DP2A>
+{
+  using ElementDVal = int32_t;
+  using ElementAVal = int16_t;
+  using ElementBVal = int16_t;
+  using ElementCVal = int32_t;
+
+  using Shape_MNK = Shape<_1,_1,_2>;
+  using ThrID   = Layout<_1>;
+  using ALayout = Layout<Shape<_1,_2>>;
+  using BLayout = Layout<Shape<_1,_2>>;
+  using CLayout = Layout<Shape<_1,_1>>;
+};
+
+} // namespace cute
diff --git a/include/cute/atom/mma_traits_sm70.hpp b/include/cute/atom/mma_traits_sm70.hpp
new file mode 100644
index 0000000000..79430350ce
--- /dev/null
+++ b/include/cute/atom/mma_traits_sm70.hpp
@@ -0,0 +1,198 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include + +namespace cute +{ + +namespace { + +// Logical thread id to thread idx (quadpair) +using SM70_QuadPair = Layout, + Stride<_1,_16>>; +// (T8,V4) -> (M8,K4) +using SM70_8x4_Row = Layout, + Stride<_1,_8>>; +// (T8,V4) -> (M8,K4) +using SM70_8x4_Col = Layout,_4>, + Stride,_1>>; +// (T8,V8) -> (M8,N8) +using SM70_8x8_16b = Layout, + Stride<_1,_8>>; +// (T8,V8) -> (M8,N8) +using SM70_8x8_32b = Layout,Shape <_2,_2, _2>>, + Stride,Stride<_8,_2,_32>>>; + +} + +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ElementDVal = half_t; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = half_t; + + using Shape_MNK = Shape<_8,_8,_4>; + using ThrID = SM70_QuadPair; + using ALayout = SM70_8x4_Row; + using BLayout = SM70_8x4_Row; + using CLayout = SM70_8x8_16b; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ElementDVal = half_t; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = half_t; + + using Shape_MNK = Shape<_8,_8,_4>; + using ThrID = SM70_QuadPair; + using ALayout = SM70_8x4_Col; + using BLayout = SM70_8x4_Col; + using CLayout = SM70_8x8_16b; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ElementDVal = half_t; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = half_t; + + using Shape_MNK = Shape<_8,_8,_4>; + using ThrID = SM70_QuadPair; + using ALayout = SM70_8x4_Col; + using BLayout = SM70_8x4_Row; + using CLayout = SM70_8x8_16b; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ElementDVal = half_t; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = half_t; + + using Shape_MNK = Shape<_8,_8,_4>; + using ThrID = SM70_QuadPair; + using ALayout = SM70_8x4_Row; + using BLayout = SM70_8x4_Col; + using CLayout = SM70_8x8_16b; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ElementDVal = float; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = float; + + using Shape_MNK = Shape<_8,_8,_4>; + using ThrID = SM70_QuadPair; + using ALayout = SM70_8x4_Row; + using BLayout = SM70_8x4_Row; + using CLayout = SM70_8x8_32b; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ElementDVal = float; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = float; + + using Shape_MNK = Shape<_8,_8,_4>; + using ThrID = SM70_QuadPair; + using ALayout = SM70_8x4_Col; + using BLayout = SM70_8x4_Col; + using CLayout = SM70_8x8_32b; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ElementDVal = float; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = float; + + using Shape_MNK = Shape<_8,_8,_4>; + using ThrID = SM70_QuadPair; + using ALayout = SM70_8x4_Col; + using BLayout = SM70_8x4_Row; + using CLayout = SM70_8x8_32b; +}; + 
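+// [Editor's note] A sketch of how these quadpair-scoped atoms are typically tiled to a
+// full warp (the op name assumes cute/arch/mma_sm70.hpp is in scope):
+//
+//   // 8-thread quadpair atom x (2,2) atom layout = 32 threads, 16x16x4 tile
+//   auto tiled_mma = make_tiled_mma(SM70_8x8x4_F32F16F16F32_NT{},
+//                                   Layout<Shape<_2,_2>>{});
+//
+// The quadpair ThrID above maps the atom's 8 logical threads onto lanes {0..3, 16..19},
+// so the tiled product covers all 32 lanes of the warp.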
+/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ElementDVal = float; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = float; + + using Shape_MNK = Shape<_8,_8,_4>; + using ThrID = SM70_QuadPair; + using ALayout = SM70_8x4_Row; + using BLayout = SM70_8x4_Col; + using CLayout = SM70_8x8_32b; +}; + +/////////////////////////////////////////////////////////////////////////////// +} // namespace cute diff --git a/include/cute/atom/mma_traits_sm75.hpp b/include/cute/atom/mma_traits_sm75.hpp new file mode 100644 index 0000000000..405e871fd2 --- /dev/null +++ b/include/cute/atom/mma_traits_sm75.hpp @@ -0,0 +1,81 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include + +namespace cute +{ + +template <> +struct MMA_Traits +{ + using ElementDVal = float; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = float; + + using Shape_MNK = Shape<_16,_8,_8>; + using ThrID = Layout<_32>; + using ALayout = Layout,Shape < _2,_2>>, + Stride,Stride<_16,_1>>>; + using BLayout = Layout,_2>, + Stride,_8>>; + using CLayout = Layout,Shape < _2,_2>>, + Stride,Stride<_16,_1>>>; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using Shape_MNK = Shape<_8,_8,_16>; + using ThrID = Layout<_32>; + using ALayout = Layout,_4>, + Stride,_8>>; + using BLayout = Layout,_4>, + Stride,_8>>; + using CLayout = Layout,_2>, + Stride,_8>>; +}; + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace cute diff --git a/include/cute/atom/mma_traits_sm80.hpp b/include/cute/atom/mma_traits_sm80.hpp new file mode 100644 index 0000000000..6636b7aaa5 --- /dev/null +++ b/include/cute/atom/mma_traits_sm80.hpp @@ -0,0 +1,446 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include +#include + +#include + +#include + +#include + +namespace cute +{ + +namespace { + +// (T32,V1) -> (M8,N8) +using SM80_8x4 = Layout,_1>, + Stride,_0>>; +// (T32,V2) -> (M8,N8) +using SM80_8x8_Row = Layout,_2>, + Stride,_8>>; +// (T32,V4) -> (M8,N16) +using SM80_8x16_Row = Layout,_4>, + Stride,_8>>; +// (T32,V4) -> (M16,N8) +using SM80_16x8_Row = Layout,Shape < _2,_2>>, + Stride,Stride<_16,_8>>>; + +} + +/////////////////////////////////////////////////////////////////////////////// +//////////////////////// fp16 = fp16 * fp16 + fp16 //////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ElementDVal = half_t; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = half_t; + + using Shape_MNK = Shape<_16,_8,_8>; + using ThrID = Layout<_32>; + using ALayout = SM80_16x8_Row; + using BLayout = SM80_8x8_Row; + using CLayout = SM80_16x8_Row; +}; + +template <> +struct MMA_Traits +{ + using ElementDVal = half_t; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = half_t; + + using Shape_MNK = Shape<_16,_8,_16>; + using ThrID = Layout<_32>; + using ALayout = Layout,Shape < _2,_2, _2>>, + Stride,Stride<_16,_8,_128>>>; + using BLayout = Layout,Shape <_2, _2>>, + Stride,Stride<_8,_64>>>; + using CLayout = SM80_16x8_Row; +}; + +/////////////////////////////////////////////////////////////////////////////// +//////////////////////// fp32 = fp16 * fp16 + fp32 //////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ElementDVal = float; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = float; +}; + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ElementDVal = float; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = float; +}; + +/////////////////////////////////////////////////////////////////////////////// +//////////////////////// fp32 = bf16 * bf16 + fp32 //////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ElementDVal = float; + using ElementAVal = bfloat16_t; + using ElementBVal = bfloat16_t; + using ElementCVal = float; +}; + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ElementDVal = float; + using ElementAVal = bfloat16_t; + using ElementBVal = bfloat16_t; + using ElementCVal = float; +}; + +/////////////////////////////////////////////////////////////////////////////// +//////////////////////// fp32 = tf32 * tf32 + fp32 //////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ElementDVal = float; + using ElementAVal = cutlass::tfloat32_t; + using ElementBVal = cutlass::tfloat32_t; + using ElementCVal = float; + + using Shape_MNK = Shape<_16,_8,_4>; + using ThrID = Layout<_32>; + using ALayout = Layout,_2>, + Stride,_8>>; + using BLayout = SM80_8x4; + using CLayout = SM80_16x8_Row; +}; + +template <> +struct MMA_Traits +{ + using ElementDVal = float; + using ElementAVal = cutlass::tfloat32_t; + using ElementBVal = cutlass::tfloat32_t; + using ElementCVal = float; + + using Shape_MNK = Shape<_16,_8,_8>; + 
using ThrID = Layout<_32>; + using ALayout = Layout,Shape <_2, _2>>, + Stride,Stride<_8,_64>>>; + using BLayout = Layout, _2>, + Stride,_32>>; + using CLayout = SM80_16x8_Row; +}; + +/////////////////////////////////////////////////////////////////////////////// +//////////////////////// fp64 = fp64 * fp64 + fp64 //////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ElementDVal = double; + using ElementAVal = double; + using ElementBVal = double; + using ElementCVal = double; + + using Shape_MNK = Shape<_8,_8,_4>; + using ThrID = Layout<_32>; + using ALayout = SM80_8x4; + using BLayout = SM80_8x4; + using CLayout = SM80_8x8_Row; +}; + +// Custom complex fp64 MMA composed of 4 fp64 MMAs -- same layouts +template <> +struct MMA_Traits + : MMA_Traits +{ + using ElementDVal = complex; + using ElementAVal = complex; + using ElementBVal = complex; + using ElementCVal = complex; +}; + +// Custom complex fp64 MMA composed of 3 fp64 MMAs -- same layouts +template <> +struct MMA_Traits + : MMA_Traits +{ + using ElementDVal = typename SM80_8x8x4_GC64C64C64GC64_TN::GaussComplex; + using ElementAVal = complex; + using ElementBVal = complex; + using ElementCVal = typename SM80_8x8x4_GC64C64C64GC64_TN::GaussComplex; +}; + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////// s32 = s8 * s8 + s32 /////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using Shape_MNK = Shape<_8,_8,_16>; + using ThrID = Layout<_32>; + using ALayout = SM80_8x16_Row; + using BLayout = SM80_8x16_Row; + using CLayout = SM80_8x8_Row; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +template <> +struct MMA_Traits +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using Shape_MNK = Shape<_16,_8,_16>; + using ThrID = Layout<_32>; + using ALayout = Layout,Shape < _4,_2>>, + Stride,Stride<_16,_8>>>; + using BLayout = SM80_8x16_Row; + using CLayout = SM80_16x8_Row; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +template <> +struct MMA_Traits +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using Shape_MNK = Shape<_16,_8,_32>; + using ThrID = Layout<_32>; + using ALayout = Layout,Shape < _4,_2, _2>>, + Stride,Stride<_16,_8,_256>>>; + using BLayout = Layout, Shape <_4, _2>>, + Stride, Stride<_8,_128>>>; + using CLayout = SM80_16x8_Row; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////// s32 = s8 * u8 + s32 /////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + 
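+// [Editor's note] The specializations above illustrate the reuse idiom of this file:
+// a *_SATURATE op inherits its base op's traits unchanged, and mixed-sign variants
+// inherit the Thr/Val layouts while overriding only the element types. A usage sketch
+// (the op name assumes cute/arch/mma_sm80.hpp is in scope):
+//
+//   // 2x2x1 tiling of the 16x8x32 s8 MMA: 128 threads computing a 32x16x32 tile
+//   auto tiled_s8 = make_tiled_mma(SM80_16x8x32_S32S8S8S32_TN{},
+//                                  Layout<Shape<_2,_2,_1>>{});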
+template <> +struct MMA_Traits + : MMA_Traits +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////// s32 = u8 * s8 + s32 /////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////// s32 = u8 * u8 + s32 /////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; +}; + +template <> +struct MMA_Traits + : MMA_Traits {}; + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////// s32 = b1 ^ b1 + s32 /////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ElementDVal = int32_t; + using ElementAVal = cute::uint1b_t; + using ElementBVal = cute::uint1b_t; + using ElementCVal = int32_t; + + using Shape_MNK = Shape<_16,_8,_256>; + using ThrID = Layout<_32>; + using ALayout = Layout>, + Stride<_64,Stride<_64,_16,_8,_2048>>>; + using BLayout = Layout>, + Stride<_32,Stride< _1,_1024>>>; + using CLayout = SM80_16x8_Row; +}; +} // end namespace cute diff --git a/include/cute/atom/mma_traits_sm90.hpp b/include/cute/atom/mma_traits_sm90.hpp new file mode 100644 index 0000000000..b7a12b98f4 --- /dev/null +++ b/include/cute/atom/mma_traits_sm90.hpp @@ -0,0 +1,132 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include +#include + +#include + +namespace cute { + +/////////////////////////////////////////////////////////////////////////////// +//////////////////////// fp64 = fp64 * fp64 + fp64 //////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits +{ + using ElementDVal = double; + using ElementAVal = double; + using ElementBVal = double; + using ElementCVal = double; + + using Shape_MNK = Shape<_16,_8,_4>; + using ThrID = Layout<_32>; + using ALayout = Layout,_2>, + Stride,_8>>; + using BLayout = Layout,_1>, + Stride,_0>>; + using CLayout = Layout,Shape < _2,_2>>, + Stride,Stride<_16,_8>>>; +}; + +template <> +struct MMA_Traits +{ + using ElementDVal = double; + using ElementAVal = double; + using ElementBVal = double; + using ElementCVal = double; + + using Shape_MNK = Shape<_16,_8,_8>; + using ThrID = Layout<_32>; + using ALayout = Layout,Shape <_2, _2>>, + Stride,Stride<_8,_64>>>; + using BLayout = Layout, _2>, + Stride,_32>>; + using CLayout = Layout,Shape < _2,_2>>, + Stride,Stride<_16,_8>>>; +}; + +template <> +struct MMA_Traits +{ + using ElementDVal = double; + using ElementAVal = double; + using ElementBVal = double; + using ElementCVal = double; + + using Shape_MNK = Shape<_16,_8,_16>; + using ThrID = Layout<_32>; + using ALayout = Layout,Shape <_2, _4>>, + Stride,Stride<_8,_64>>>; + using BLayout = Layout, _4>, + Stride,_32>>; + using CLayout = Layout,Shape < _2,_2>>, + Stride,Stride<_16,_8>>>; +}; + +/////////////////////////////////////////////////////////////////////////////////// +//////////////////////// cfp64 = cfp64 * cfp64 + cfp64 //////////////////////////// +/////////////////////////////////////////////////////////////////////////////////// + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ElementDVal = complex; + using ElementAVal = complex; + using ElementBVal = complex; + using ElementCVal = complex; +}; + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ElementDVal = complex; + using ElementAVal = complex; + using ElementBVal = complex; + using ElementCVal = complex; +}; + +template <> +struct MMA_Traits + : MMA_Traits +{ + using ElementDVal = 
complex; + using ElementAVal = complex; + using ElementBVal = complex; + using ElementCVal = complex; +}; + +} // end namespace cute diff --git a/include/cute/atom/mma_traits_sm90_gmma.hpp b/include/cute/atom/mma_traits_sm90_gmma.hpp new file mode 100644 index 0000000000..d390dafc58 --- /dev/null +++ b/include/cute/atom/mma_traits_sm90_gmma.hpp @@ -0,0 +1,2975 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/arch/mma_sm90.hpp>
+#include <cute/arch/mma_sm90_desc.hpp>
+
+#include <cute/atom/mma_traits.hpp>
+
+namespace cute {
+
+namespace GMMA {
+
+///////////////////////////////////////////
+// Common layouts for GMMA Shared Memory //
+///////////////////////////////////////////
+
+// M|N-major GMMA layouts in units of bits
+using Layout_MN_INTER_Atom_Bits = Layout<Shape<_128,_8>,Stride<_1,_128>>;
+using Layout_MN_SW32_Atom_Bits  = ComposedLayout<Swizzle<1,4,3>, smem_ptr_flag, Layout<Shape< _256,_8>,Stride<_1, _256>>>;
+using Layout_MN_SW64_Atom_Bits  = ComposedLayout<Swizzle<2,4,3>, smem_ptr_flag, Layout<Shape< _512,_8>,Stride<_1, _512>>>;
+using Layout_MN_SW128_Atom_Bits = ComposedLayout<Swizzle<3,4,3>, smem_ptr_flag, Layout<Shape<_1024,_8>,Stride<_1,_1024>>>;
+
+// K-major GMMA layouts in units of bits
+using Layout_K_INTER_Atom_Bits = Layout<Shape<_8,_128>,Stride<_128,_1>>;
+using Layout_K_SW32_Atom_Bits  = ComposedLayout<Swizzle<1,4,3>, smem_ptr_flag, Layout<Shape<_8, _256>,Stride< _256,_1>>>;
+using Layout_K_SW64_Atom_Bits  = ComposedLayout<Swizzle<2,4,3>, smem_ptr_flag, Layout<Shape<_8, _512>,Stride< _512,_1>>>;
+using Layout_K_SW128_Atom_Bits = ComposedLayout<Swizzle<3,4,3>, smem_ptr_flag, Layout<Shape<_8,_1024>,Stride<_1024,_1>>>;
+
+// M|N-major layouts in units of Type
+template <class Type>
+using Layout_MN_INTER_Atom = decltype(upcast<sizeof_bits<Type>::value>(Layout_MN_INTER_Atom_Bits{}));
+template <class Type>
+using Layout_MN_SW32_Atom  = decltype(upcast<sizeof_bits<Type>::value>(Layout_MN_SW32_Atom_Bits{}));
+template <class Type>
+using Layout_MN_SW64_Atom  = decltype(upcast<sizeof_bits<Type>::value>(Layout_MN_SW64_Atom_Bits{}));
+template <class Type>
+using Layout_MN_SW128_Atom = decltype(upcast<sizeof_bits<Type>::value>(Layout_MN_SW128_Atom_Bits{}));
+
+// K-major layouts in units of Type
+template <class Type>
+using Layout_K_INTER_Atom = decltype(upcast<sizeof_bits<Type>::value>(Layout_K_INTER_Atom_Bits{}));
+template <class Type>
+using Layout_K_SW32_Atom  = decltype(upcast<sizeof_bits<Type>::value>(Layout_K_SW32_Atom_Bits{}));
+template <class Type>
+using Layout_K_SW64_Atom  = decltype(upcast<sizeof_bits<Type>::value>(Layout_K_SW64_Atom_Bits{}));
+template <class Type>
+using Layout_K_SW128_Atom = decltype(upcast<sizeof_bits<Type>::value>(Layout_K_SW128_Atom_Bits{}));
+
+// With GMMA::Major param
+template <GMMA::Major major, class Type>
+using Layout_INTER_Atom = typename std::conditional<major == GMMA::Major::MN,
+                                                    Layout_MN_INTER_Atom<Type>,
+                                                    Layout_K_INTER_Atom<Type>>::type;
+template <GMMA::Major major, class Type>
+using Layout_SW32_Atom = typename std::conditional<major == GMMA::Major::MN,
+                                                   Layout_MN_SW32_Atom<Type>,
+                                                   Layout_K_SW32_Atom<Type>>::type;
+template <GMMA::Major major, class Type>
+using Layout_SW64_Atom = typename std::conditional<major == GMMA::Major::MN,
+                                                   Layout_MN_SW64_Atom<Type>,
+                                                   Layout_K_SW64_Atom<Type>>::type;
+template <GMMA::Major major, class Type>
+using Layout_SW128_Atom = typename std::conditional<major == GMMA::Major::MN,
+                                                    Layout_MN_SW128_Atom<Type>,
+                                                    Layout_K_SW128_Atom<Type>>::type;
+
+// Helper for GMMA smem selection that considers a tensor TileShape:
+//   (BLK_MN, BLK_K)
+//   or hierarchically
+//   ((BLK_MN0,BLK_MN1,...),(BLK_K0,BLK_K1,...))
+//   and returns the largest GMMA::Layout that fits BLK_MN0 and BLK_K0
+template <GMMA::Major major, class ElementType, class BLK_MN, class BLK_K>
+CUTE_HOST_DEVICE constexpr
+auto
+smem_selector()
+{
+  auto BLK_MN0 = size<0>(BLK_MN{});
+  auto BLK_K0  = size<0>(BLK_K{});
+
+  static_assert(BLK_MN0 % 8 == 0, "BLK_MN0 must be a multiple of 8.");
+  static_assert(BLK_K0 % 8 == 0,  "BLK_K0 must be a multiple of 8.");
+
+  if constexpr (major == GMMA::Major::MN) {
+    if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW128_Atom<ElementType>{}) == 0) {
+      return GMMA::Layout_MN_SW128_Atom<ElementType>{};
+    } else if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW64_Atom<ElementType>{}) == 0) {
+      return GMMA::Layout_MN_SW64_Atom<ElementType>{};
+    } else if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW32_Atom<ElementType>{}) == 0) {
+      return GMMA::Layout_MN_SW32_Atom<ElementType>{};
+    } else if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_INTER_Atom<ElementType>{}) == 0) {
+      return GMMA::Layout_MN_INTER_Atom<ElementType>{};
+    } else {
+      static_assert(BLK_MN0 % size<0>(GMMA::Layout_MN_INTER_Atom<ElementType>{}) == 0,
+                    "BLK_MN0 must be a multiple of size<0>(GMMA::Layout_MN_INTER_Atom<ElementType>{})");
+    }
+  } else if constexpr (major == GMMA::Major::K) {
+    if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_SW128_Atom<ElementType>{}) == 0) {
+      return GMMA::Layout_K_SW128_Atom<ElementType>{};
+    } else if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_SW64_Atom<ElementType>{}) == 0) {
+      return GMMA::Layout_K_SW64_Atom<ElementType>{};
+    } else if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_SW32_Atom<ElementType>{}) == 0) {
+      return GMMA::Layout_K_SW32_Atom<ElementType>{};
+    } else if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_INTER_Atom<ElementType>{}) == 0) {
+      return GMMA::Layout_K_INTER_Atom<ElementType>{};
+    } else {
+      static_assert(BLK_K0 % size<1>(GMMA::Layout_K_INTER_Atom<ElementType>{}) == 0,
+                    "BLK_K0 must be a multiple of size<1>(GMMA::Layout_K_INTER_Atom<ElementType>{})");
+    }
+  }
+}
+
+//
+// Tensor to LayoutType utility
+//
+
+// smem_ptr_swizzle LayoutType
+template <int B, int M, int S, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+LayoutType
+layout_type(Tensor<ViewEngine<smem_ptr_swizzle<uint128_t const, Swizzle<B,M,S>>>,
+                   Layout<Shape,Stride>> const&)
+{
+  static_assert(M == 4,           "Unsupported layout swizzle");
+  static_assert(0 <= B && B <= 3, "Unsupported layout swizzle");
+  static_assert(S == 3,           "Unsupported layout swizzle");
+
+  switch (B) {
+    case 0: return LayoutType::INTERLEAVE;
+    case 1: return LayoutType::B32;
+    case 2: return LayoutType::B64;
+    case 3: return LayoutType::B128;
+  }
+  return LayoutType::INTERLEAVE;  // ERROR
+}
+
+// smem_ptr non-swizzled LayoutType
+template <class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+LayoutType
+layout_type(Tensor<ViewEngine<smem_ptr<uint128_t const>>,
+                   Layout<Shape,Stride>> const&)
+{
+  return LayoutType::INTERLEAVE;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Construction method for GMMA Descriptors
+///////////////////////////////////////////////////////////////////////////////
+
+/**
+* ///////////////////////////////
+* // make_gmma_desc<Major::MN> //
+* ///////////////////////////////
+* Each GmmaDescriptor Major-MN describes a canonical layout of the form
+*
+* LayoutType::INTERLEAVE : Swizzle<0,4,3> o smem_ptr o ((T,1,m),(8,k)):((1,T,SBO),(1T,LBO))
+* LayoutType::B32        : Swizzle<1,4,3> o smem_ptr o ((T,2,m),(8,k)):((1,T,LBO),(2T,SBO))
+* LayoutType::B64        : Swizzle<2,4,3> o smem_ptr o ((T,4,m),(8,k)):((1,T,LBO),(4T,SBO))
+* LayoutType::B128       : Swizzle<3,4,3> o smem_ptr o ((T,8,m),(8,k)):((1,T,LBO),(8T,SBO))
+*
+* where
+*   T  : sizeof(uint128_t) / sizeof(value_type)
+*   m  : integer in [1,16] corresponding to GMMA shape
+*   k  : integer in [1,32] corresponding to GMMA shape
+*   SBO: stride byte offset
+*   LBO: leading byte offset
+*
+* See GMMA::Layout_MN_XXX_Atom for building canonical GmmaDescriptor Major-MN layouts.
+* For example,
+*   auto smem_layout = tile_to_shape(Layout_MN_SW128_Atom<value_type>{}, Shape<_128,_64>{});
+* is guaranteed to be accepted by make_gmma_desc<Major::MN> for appropriate value_type.
+*
+* //////////////////////////////
+* // make_gmma_desc<Major::K>  //
+* //////////////////////////////
+* Each GmmaDescriptor Major-K describes a canonical layout of the form
+*
+* LayoutType::INTERLEAVE : Swizzle<0,4,3> o smem_ptr o ((8,m),(T,2)):((1T,SBO),(1,LBO))
+* LayoutType::B32        : Swizzle<1,4,3> o smem_ptr o ((8,m),(T,2)):((2T,SBO),(1, T ))
+* LayoutType::B64        : Swizzle<2,4,3> o smem_ptr o ((8,m),(T,2)):((4T,SBO),(1, T ))
+* LayoutType::B128       : Swizzle<3,4,3> o smem_ptr o ((8,m),(T,2)):((8T,SBO),(1, T ))
+*
+* See GMMA::Layout_K_XXX_Atom for building canonical GmmaDescriptor Major-K layouts.
+* For example,
+*   auto smem_layout = tile_to_shape(Layout_K_SW128_Atom<value_type>{}, Shape<_128,_64>{});
+* is guaranteed to be accepted by make_gmma_desc<Major::K> for appropriate value_type.
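+*
+* An editorial usage sketch, not part of the original comment: `smem` below is a
+* hypothetical half_t* into shared memory (so T = 8), and everything else is the
+* machinery defined in this file:
+*   auto layout  = tile_to_shape(Layout_K_SW128_Atom<half_t>{}, Shape<_128,_64>{});
+*   Tensor stensor = make_tensor(make_smem_ptr(smem), layout);
+*   GmmaDescriptor desc = make_gmma_desc<Major::K>(stensor);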
+*/
+template <GMMA::Major MajorMode, class TEngine, class TLayout>
+CUTE_HOST_DEVICE constexpr
+GmmaDescriptor
+make_gmma_desc(Tensor<TEngine,TLayout> const& tensor)
+{
+  static_assert(is_smem<TEngine>::value, "GMMA Descriptors can only be constructed on smem.");
+  static_assert(TLayout::rank == 2, "GMMA Descriptors can only be constructed on rank-2 tensors.");
+  using value_type = typename TEngine::value_type;
+
+  Tensor u128_tensor = recast<uint128_t const>(tensor);
+
+  // Result
+  GmmaDescriptor desc;
+
+  // Layout type
+  constexpr GMMA::LayoutType LAYOUT_TYPE = GMMA::layout_type(u128_tensor);
+  desc.layout_type_ = uint8_t(LAYOUT_TYPE);
+
+  // Start address (4LSB not included)
+  uint32_t start_address = cast_smem_ptr_to_uint(u128_tensor.data().get());
+  desc.start_address_ = start_address >> 4;
+
+  constexpr uint8_t base_offset = 0;
+  desc.base_offset_ = base_offset;
+
+  // LayoutType meta
+  constexpr int W = LAYOUT_TYPE == GMMA::LayoutType::INTERLEAVE ? 1 :
+                    LAYOUT_TYPE == GMMA::LayoutType::B32        ? 2 :
+                    LAYOUT_TYPE == GMMA::LayoutType::B64        ? 4 :
+                    LAYOUT_TYPE == GMMA::LayoutType::B128       ? 8 : -1;
+
+  if constexpr (MajorMode == GMMA::Major::MN)
+  {
+    /* In units of uint128_t, each GmmaDescriptor Major-MN describes a canonical layout of the form
+     *
+     * LayoutType::INTERLEAVE : Swizzle<0,4,3> o smem_ptr o ((1,n),(8,k)):((X,SBO),(1,LBO))
+     * LayoutType::B32        : Swizzle<1,4,3> o smem_ptr o ((2,n),(8,k)):((1,LBO),(2,SBO))
+     * LayoutType::B64        : Swizzle<2,4,3> o smem_ptr o ((4,n),(8,k)):((1,LBO),(4,SBO))
+     * LayoutType::B128       : Swizzle<3,4,3> o smem_ptr o ((8,n),(8,k)):((1,LBO),(8,SBO))
+     */
+    static_assert(size<1>(u128_tensor) == Int<(256 / cute::sizeof_bits<value_type>::value)>{},  // K size
+                  "Not a canonical GMMA_MN Layout: Expected K-size 256/sizeof_bits<value_type>.");
+
+    // Construct the canonical GMMA T Layout with shape ((W,n),(8,2))
+    Layout canonical_layout = logical_divide(layout(u128_tensor), make_tile(Layout<Int<W>,_1>{}, Layout<Int<8>,_1>{}));
+
+    // Check ranks of canonical
+    CUTE_STATIC_ASSERT_V(rank<0>(canonical_layout) == Int<2>{}, "Not a canonical GMMA_MN Layout: No flat offset mode");
+    CUTE_STATIC_ASSERT_V(rank<1>(canonical_layout) == Int<2>{}, "Not a canonical GMMA_MN Layout: No flat offset mode");
+    // Check canonical mode strides
+    constexpr uint32_t stride_00 = stride<0,0>(canonical_layout);
+    constexpr uint32_t expected_stride_00 = LAYOUT_TYPE == GMMA::LayoutType::INTERLEAVE ? stride<0,0>(canonical_layout) : 1;
+    static_assert(stride_00 == expected_stride_00, "Not a canonical GMMA_MN Layout: Expected stride failure.");
+    constexpr uint32_t stride_10 = stride<1,0>(canonical_layout);
+    constexpr uint32_t expected_stride_10 = W;
+    static_assert(stride_10 == expected_stride_10, "Not a canonical GMMA_MN Layout: Expected stride failure.");
+
+    // stride dimension byte offset and leading dimension byte offset (4LSB not included == uint128_t units)
+    constexpr uint32_t stride_01 = stride<0,1>(canonical_layout);
+    constexpr uint32_t stride_11 = stride<1,1>(canonical_layout);
+
+    desc.stride_byte_offset_  = (LAYOUT_TYPE == GMMA::LayoutType::INTERLEAVE) ? stride_01 : stride_11;
+    desc.leading_byte_offset_ = (LAYOUT_TYPE == GMMA::LayoutType::INTERLEAVE) ? stride_11 : stride_01;
+  }
+  else if constexpr (MajorMode == GMMA::Major::K)
+  {
+    /* In units of uint128_t, each GmmaDescriptor Major-K describes a canonical layout of the form
+     *
+     * LayoutType::INTERLEAVE : Swizzle<0,4,3> o smem_ptr o ((8,n),2):((1,SBO),LBO)
+     * LayoutType::B32        : Swizzle<1,4,3> o smem_ptr o ((8,n),2):((2,SBO),1)
+     * LayoutType::B64        : Swizzle<2,4,3> o smem_ptr o ((8,n),2):((4,SBO),1)
+     * LayoutType::B128       : Swizzle<3,4,3> o smem_ptr o ((8,n),2):((8,SBO),1)
+     */
+    CUTE_STATIC_ASSERT_V(size<0>(u128_tensor) % Int<8>{} == Int<0>{},  // N|M size
+                         "Not a canonical GMMA_K Layout: Expected MN-size multiple of 8.");
+    CUTE_STATIC_ASSERT_V(size<1>(u128_tensor) == Int<2>{},             // K   size
+                         "Not a canonical GMMA_K Layout: Expected K-size 2 (in units of uint128_t).");
+
+    // Construct the canonical GMMA N Layout with shape ((8,n),(2,1))
+    Layout canonical_layout = logical_divide(layout(u128_tensor), make_tile(Layout<_8,_1>{}, Layout<_2,_1>{}));
+
+    // Check ranks of canonical
+    CUTE_STATIC_ASSERT_V(rank<0>(canonical_layout) == Int<2>{}, "Not a canonical GMMA_K Layout: No flat offset mode");
+    CUTE_STATIC_ASSERT_V(rank<1>(canonical_layout) == Int<2>{}, "Not a canonical GMMA_K Layout: No flat offset mode");
+    // Check canonical mode strides
+    constexpr uint32_t stride_00 = stride<0,0>(canonical_layout);
+    constexpr uint32_t expected_stride_00 = W;
+    static_assert(stride_00 == expected_stride_00, "Not a canonical GMMA_K Layout: Expected stride failure.");
+    constexpr uint32_t stride_10 = stride<1,0>(canonical_layout);
+    constexpr uint32_t expected_stride_10 = (LAYOUT_TYPE == GMMA::LayoutType::INTERLEAVE) ? stride<1,0>(canonical_layout) : 1;
+    static_assert(stride_10 == expected_stride_10, "Not a canonical GMMA_K Layout: Expected stride failure.");
+
+    // stride dimension byte offset and leading dimension byte offset (4LSB not included == uint128_t units)
+    constexpr uint32_t stride_01 = stride<0,1>(canonical_layout);
+
+    desc.stride_byte_offset_  = stride_01;
+    desc.leading_byte_offset_ = stride_10;
+  } else {
+    static_assert(MajorMode != GMMA::Major::MN && MajorMode != GMMA::Major::K, "Unrecognized MajorMode!");
+  }
+
+#if 0
+  // DEBUG and SANITY
+  assert((start_address & 0b0000001111) == 0);  // Must be 16B aligned (4LSB are 0) no negotiation
+  assert((start_address & 0b1110000000) == 0);  // Assert base_offset is 0, generalize later
+  if (thread0()) {
+    print("smem_desc input tensor: "); print(tensor.data()); print(" o "); print(tensor.layout()); print("\n");
+    print("smem_desc uint128_t tensor: "); print(u128_tensor.data()); print(" o "); print(u128_tensor.layout()); print("\n");
+    //print("     desc canonical layout: "); print(canonical_layout); print("\n");
+    print(desc);
+  }
+#endif
+
+  return desc;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Higher level GMMA Descriptor utilities
+///////////////////////////////////////////////////////////////////////////////
+
+struct gmma_descriptor_iterator
+{
+  GmmaDescriptor desc_;
+
+  // Dereference returns the GmmaDescriptor
+  CUTE_HOST_DEVICE constexpr
+  GmmaDescriptor const& operator*() const { return desc_; }
+
+  // Advance and return a new GmmaDescriptor
+  template <class Index>
+  CUTE_HOST_DEVICE constexpr
+  GmmaDescriptor operator[](Index const& i) const { return *(*this + i); }
+
+  // Return an advanced iterator
+  template <class Index>
+  CUTE_HOST_DEVICE constexpr
+  gmma_descriptor_iterator operator+(Index const& offset) const
+  {
+    // offset is in the units of uint128_t (4LSB of start_address not included)
+
//GmmaDescriptor desc = desc_; + //desc.start_address_ += uint16_t(offset); + //desc.reg32_[0] += uint16_t(offset); // Generates better asm than adding to the bitfield + + // May need to update base_offset if swizzle alignment isn't guaranteed + //desc.base_offset_ = 0; + //assert((desc.start_address_ & 0b111000) == 0); // Assert base_offset is 0, generalize later + + //return {desc}; + + // The above seems to not work for some reason... + return {desc_ + uint64_t(offset)}; + } +}; + +template +struct smem_desc : gmma_descriptor_iterator {}; + +template +CUTE_HOST_DEVICE constexpr +auto +make_gmma_desc_fragment(Tensor const& t) +{ + // Cast to a uint128_t tensor for GMMA Desc iteration + return make_tensor(gmma_descriptor_iterator{make_gmma_desc(tensor<0>(t))}, + recast(t).layout()); +} + +// Recast a tensor to a tensor of gmma_descriptor_iterator +template +CUTE_HOST_DEVICE constexpr +auto +recast(Tensor&& tensor, type_list>) +{ + return make_gmma_desc_fragment(tensor); +} + +// Recast a gmma_descriptor_iterator Tensor to uint64_t, it's RegType +template +CUTE_HOST_DEVICE constexpr +auto +recast(Tensor,TLayout> const& tensor, type_list) +{ + static_assert(std::is_same::value, "Can only cast descriptors to uint64_t."); + return make_tensor(tensor.data(), Layout<_1,_0>{}); +} + +} // end namespace GMMA + +// Fence between the async destination accumulators of GMMA & source for their dependent use +template +CUTE_HOST_DEVICE +void +warpgroup_fence_operand(Tensor& frg) { + CUTE_STATIC_ASSERT(is_static::value); + if constexpr (std::is_same_v) { + auto f32_frg = recast(frg); + CUTE_UNROLL + for (int i = 0; i < size(f32_frg); ++i) { + warpgroup_fence_operand(f32_frg(i)); + } + } + else { + CUTE_STATIC_ASSERT(is_rmem::value); + auto u32_frg = recast(frg); + CUTE_UNROLL + for (int i = 0; i < size(u32_frg); ++i) { + warpgroup_fence_operand(u32_frg(i)); + } + } +} + +/////////////////////////////////////////////////////////////////////////////// +//////////////////////////// MMA_TRAITS /////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +namespace GMMA { + +// Accumulator layouts +using CLayout_64x8 = Layout,Shape < _2,_2>>, + Stride,Stride<_64,_8>>>; + +using CLayout_64x16 = Layout,Shape < _2,_2, _2>>, + Stride,Stride<_64,_8,_512>>>; + +using CLayout_64x32 = Layout,Shape < _2,_2, _4>>, + Stride,Stride<_64,_8,_512>>>; + +using CLayout_64x64 = Layout,Shape < _2,_2, _8>>, + Stride,Stride<_64,_8,_512>>>; + +using CLayout_64x96 = Layout,Shape < _2,_2, _12>>, + Stride,Stride<_64,_8,_512>>>; + +using CLayout_64x128 = Layout,Shape < _2,_2, _16>>, + Stride,Stride<_64,_8,_512>>>; + +using CLayout_64x192 = Layout,Shape < _2,_2, _24>>, + Stride,Stride<_64,_8,_512>>>; + +using CLayout_64x256 = Layout,Shape < _2,_2, _32>>, + Stride,Stride<_64,_8,_512>>>; + +// Register source layout for 32-bit value types +using ALayout_64x8 = Layout,Shape < _2, _2>>, + Stride,Stride< _8,_256>>>; + +// Register source layout for 16-bit value types +using ALayout_64x16 = CLayout_64x16; + +// Register source layout for 8-bit value types +using ALayout_64x32 = Layout,Shape < _4,_2, _2>>, + Stride,Stride<_64,_8,_1024>>>; + +// Shared memory source layouts for any value type +template +using ABLayout = Layout,Int>>, + Stride< _0,Stride< _1,Int>>>; + +} // namespace GMMA + +template +struct MMA_Traits> +{ + using ElementDVal = half_t; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = half_t; + + using ElementAFrg = GMMA::smem_desc; + 
using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 8, 16>; + using CLayout = GMMA::CLayout_64x8; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = half_t; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = half_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 8, 16>; + using CLayout = GMMA::CLayout_64x8; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = half_t; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = half_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 16, 16>; + using CLayout = GMMA::CLayout_64x16; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = half_t; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = half_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 16, 16>; + using CLayout = GMMA::CLayout_64x16; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = half_t; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = half_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 32, 16>; + using CLayout = GMMA::CLayout_64x32; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = half_t; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = half_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 32, 16>; + using CLayout = GMMA::CLayout_64x32; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = half_t; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = half_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 64, 16>; + using CLayout = GMMA::CLayout_64x64; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = half_t; + using ElementAVal = half_t; + using 
ElementBVal = half_t; + using ElementCVal = half_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 64, 16>; + using CLayout = GMMA::CLayout_64x64; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = half_t; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = half_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 96, 16>; + using CLayout = GMMA::CLayout_64x96; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = half_t; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = half_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 96, 16>; + using CLayout = GMMA::CLayout_64x96; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = half_t; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = half_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<128, 16>; + using CLayout = GMMA::CLayout_64x128; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = half_t; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = half_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<128, 16>; + using CLayout = GMMA::CLayout_64x128; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = half_t; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = half_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<192, 16>; + using CLayout = GMMA::CLayout_64x192; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = half_t; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = half_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<192, 16>; + using CLayout = GMMA::CLayout_64x192; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + 
using ElementDVal = half_t; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = half_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<256, 16>; + using CLayout = GMMA::CLayout_64x256; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = half_t; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = half_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<256, 16>; + using CLayout = GMMA::CLayout_64x256; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = float; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 8, 16>; + using CLayout = GMMA::CLayout_64x8; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = float; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 8, 16>; + using CLayout = GMMA::CLayout_64x8; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = float; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 16, 16>; + using CLayout = GMMA::CLayout_64x16; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = float; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 16, 16>; + using CLayout = GMMA::CLayout_64x16; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = float; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 32, 16>; + using CLayout = GMMA::CLayout_64x32; +}; + 
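+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Editorial sketch, not part of the original patch: how these traits are meant
+// to be consumed. Wrapping one of the ops in an MMA_Atom and calling
+// make_tiled_mma yields a one-warpgroup (128-thread) TiledMMA. The GMMA::Major
+// arguments below are illustrative assumptions (other majors work the same way):
+//
+//   using Op = SM90_64x32x16_F32F16F16_SS<GMMA::Major::K, GMMA::Major::K>;
+//   auto tiled_mma = make_tiled_mma(MMA_Atom<Op>{});
+//   auto thr_mma   = tiled_mma.get_thread_slice(threadIdx.x);  // per-thread partitioner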
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = float; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 32, 16>; + using CLayout = GMMA::CLayout_64x32; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = float; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 64, 16>; + using CLayout = GMMA::CLayout_64x64; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = float; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 64, 16>; + using CLayout = GMMA::CLayout_64x64; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = float; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 96, 16>; + using CLayout = GMMA::CLayout_64x96; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = float; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 96, 16>; + using CLayout = GMMA::CLayout_64x96; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = float; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<128, 16>; + using CLayout = GMMA::CLayout_64x128; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = float; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<128, 16>; + 
using CLayout = GMMA::CLayout_64x128; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = float; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<192, 16>; + using CLayout = GMMA::CLayout_64x192; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = float; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<192, 16>; + using CLayout = GMMA::CLayout_64x192; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = float; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<256, 16>; + using CLayout = GMMA::CLayout_64x256; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = half_t; + using ElementBVal = half_t; + using ElementCVal = float; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<256, 16>; + using CLayout = GMMA::CLayout_64x256; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = bfloat16_t; + using ElementBVal = bfloat16_t; + using ElementCVal = float; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 8, 16>; + using CLayout = GMMA::CLayout_64x8; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = bfloat16_t; + using ElementBVal = bfloat16_t; + using ElementCVal = float; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 8, 16>; + using CLayout = GMMA::CLayout_64x8; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = bfloat16_t; + using ElementBVal = bfloat16_t; + using ElementCVal = float; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_16>; + using 
ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 16, 16>; + using CLayout = GMMA::CLayout_64x16; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = bfloat16_t; + using ElementBVal = bfloat16_t; + using ElementCVal = float; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 16, 16>; + using CLayout = GMMA::CLayout_64x16; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = bfloat16_t; + using ElementBVal = bfloat16_t; + using ElementCVal = float; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 32, 16>; + using CLayout = GMMA::CLayout_64x32; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = bfloat16_t; + using ElementBVal = bfloat16_t; + using ElementCVal = float; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 32, 16>; + using CLayout = GMMA::CLayout_64x32; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = bfloat16_t; + using ElementBVal = bfloat16_t; + using ElementCVal = float; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 64, 16>; + using CLayout = GMMA::CLayout_64x64; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = bfloat16_t; + using ElementBVal = bfloat16_t; + using ElementCVal = float; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 64, 16>; + using CLayout = GMMA::CLayout_64x64; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = bfloat16_t; + using ElementBVal = bfloat16_t; + using ElementCVal = float; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout< 96, 16>; + using CLayout = GMMA::CLayout_64x96; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = bfloat16_t; + using ElementBVal = bfloat16_t; + using ElementCVal 
= float; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout< 96, 16>; + using CLayout = GMMA::CLayout_64x96; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = bfloat16_t; + using ElementBVal = bfloat16_t; + using ElementCVal = float; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<128, 16>; + using CLayout = GMMA::CLayout_64x128; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = bfloat16_t; + using ElementBVal = bfloat16_t; + using ElementCVal = float; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<128, 16>; + using CLayout = GMMA::CLayout_64x128; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = bfloat16_t; + using ElementBVal = bfloat16_t; + using ElementCVal = float; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<192, 16>; + using CLayout = GMMA::CLayout_64x192; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = bfloat16_t; + using ElementBVal = bfloat16_t; + using ElementCVal = float; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<192, 16>; + using CLayout = GMMA::CLayout_64x192; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = bfloat16_t; + using ElementBVal = bfloat16_t; + using ElementCVal = float; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 16>; + using BLayout = GMMA::ABLayout<256, 16>; + using CLayout = GMMA::CLayout_64x256; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = bfloat16_t; + using ElementBVal = bfloat16_t; + using ElementCVal = float; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_16>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x16; + using BLayout = GMMA::ABLayout<256, 16>; + using CLayout = GMMA::CLayout_64x256; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using 
ElementDVal = float; + using ElementAVal = tfloat32_t; + using ElementBVal = tfloat32_t; + using ElementCVal = float; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout< 8, 8>; + using CLayout = GMMA::CLayout_64x8; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = tfloat32_t; + using ElementBVal = tfloat32_t; + using ElementCVal = float; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout< 8, 8>; + using CLayout = GMMA::CLayout_64x8; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = tfloat32_t; + using ElementBVal = tfloat32_t; + using ElementCVal = float; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout< 16, 8>; + using CLayout = GMMA::CLayout_64x16; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = tfloat32_t; + using ElementBVal = tfloat32_t; + using ElementCVal = float; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout< 16, 8>; + using CLayout = GMMA::CLayout_64x16; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = tfloat32_t; + using ElementBVal = tfloat32_t; + using ElementCVal = float; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout< 32, 8>; + using CLayout = GMMA::CLayout_64x32; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = tfloat32_t; + using ElementBVal = tfloat32_t; + using ElementCVal = float; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout< 32, 8>; + using CLayout = GMMA::CLayout_64x32; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = tfloat32_t; + using ElementBVal = tfloat32_t; + using ElementCVal = float; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout< 64, 8>; + using CLayout = GMMA::CLayout_64x64; +}; + 
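+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Editorial sketch, not part of the original patch: every GMMA::CLayout_64xN
+// maps (thread, value) -> (M,N) coordinates of the 64xN accumulator tile, so
+// its size fixes the per-thread register count directly. For the 64x64 shape
+// used by the surrounding atoms:
+//
+//   static_assert(size(GMMA::CLayout_64x64{}) == 64 * 64);   // covers the whole tile
+//   static_assert(size(GMMA::CLayout_64x64{}) / 128 == 32);  // accumulator values per thread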
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = tfloat32_t; + using ElementBVal = tfloat32_t; + using ElementCVal = float; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout< 64, 8>; + using CLayout = GMMA::CLayout_64x64; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = tfloat32_t; + using ElementBVal = tfloat32_t; + using ElementCVal = float; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout< 96, 8>; + using CLayout = GMMA::CLayout_64x96; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = tfloat32_t; + using ElementBVal = tfloat32_t; + using ElementCVal = float; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout< 96, 8>; + using CLayout = GMMA::CLayout_64x96; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = tfloat32_t; + using ElementBVal = tfloat32_t; + using ElementCVal = float; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout<128, 8>; + using CLayout = GMMA::CLayout_64x128; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = tfloat32_t; + using ElementBVal = tfloat32_t; + using ElementCVal = float; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout<128, 8>; + using CLayout = GMMA::CLayout_64x128; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = tfloat32_t; + using ElementBVal = tfloat32_t; + using ElementCVal = float; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout<192, 8>; + using CLayout = GMMA::CLayout_64x192; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = tfloat32_t; + using ElementBVal = tfloat32_t; + using ElementCVal = float; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + 
using BLayout = GMMA::ABLayout<192, 8>; + using CLayout = GMMA::CLayout_64x192; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = tfloat32_t; + using ElementBVal = tfloat32_t; + using ElementCVal = float; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 8>; + using BLayout = GMMA::ABLayout<256, 8>; + using CLayout = GMMA::CLayout_64x256; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = float; + using ElementAVal = tfloat32_t; + using ElementBVal = tfloat32_t; + using ElementCVal = float; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_8>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x8; + using BLayout = GMMA::ABLayout<256, 8>; + using CLayout = GMMA::CLayout_64x256; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = int8_t; + using ElementCVal = 
int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ 
+ using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + 
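Every specialization in this family repeats the same fully expanded pattern; written out, a single instance presumably looks like the following (a sketch assuming CUTLASS 3.0's GMMA op naming, e.g. SM90_64x64x32_S32S8S8_RS_TN, and K-major shared-memory descriptors; the comments are illustrative):

template <>
struct MMA_Traits<SM90_64x64x32_S32S8S8_RS_TN>
{
  using ElementDVal = int32_t;
  using ElementAVal = int8_t;   // A is consumed from registers in the RS variants
  using ElementBVal = int8_t;
  using ElementCVal = int32_t;

  // Only B carries a shared-memory matrix descriptor here (K-major assumed)
  using ElementBFrg = GMMA::smem_desc<GMMA::Major::K>;

  using Shape_MNK = Shape<_64,_64,_32>;       // one warpgroup-wide MMA tile
  using ThrID     = Layout<_128>;             // the 128 threads of a warpgroup
  using ALayout   = GMMA::ALayout_64x32;      // register-fragment layout for A
  using BLayout   = GMMA::ABLayout< 64, 32>;  // descriptor layout for B
  using CLayout   = GMMA::CLayout_64x64;      // accumulator layout
};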
+template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = 
GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = int8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = 
GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = 
int8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal 
= uint8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = int8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; +}; + 
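For context, a minimal sketch of how one of these traits is consumed downstream, assuming the MMA_Atom and make_tiled_mma interfaces from cute/atom/mma_atom.hpp; the op choice and the partitioning calls in the comments are illustrative, not prescribed by this file:

#include <cute/atom/mma_atom.hpp>

using namespace cute;

// One warpgroup-wide 64x128x32 int8 GMMA, wrapped as a CuTe MMA atom
using AtomS8 = MMA_Atom<SM90_64x128x32_S32S8S8_SS_TN>;

CUTE_DEVICE void mma_sketch()
{
  // Tile the atom across the 128 threads named by the trait's ThrID
  auto tiled_mma = make_tiled_mma(AtomS8{});
  // Each thread would then slice out its part of the work, e.g.:
  //   auto thr_mma = tiled_mma.get_thread_slice(threadIdx.x);
  //   auto tCsA    = thr_mma.partition_A(sA);  // A via smem descriptor (ElementAFrg)
  //   auto tCsB    = thr_mma.partition_B(sB);  // B via smem descriptor (ElementBFrg)
}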
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementAFrg = GMMA::smem_desc; + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ABLayout< 64, 32>; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_8,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 8, 32>; + using CLayout = GMMA::CLayout_64x8; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_16,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 16, 32>; + using CLayout = GMMA::CLayout_64x16; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_32,_32>; + using ThrID = Layout<_128>; 
+ using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 32, 32>; + using CLayout = GMMA::CLayout_64x32; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_64,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 64, 32>; + using CLayout = GMMA::CLayout_64x64; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_96,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout< 96, 32>; + using CLayout = GMMA::CLayout_64x96; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_128,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<128, 32>; + using CLayout = GMMA::CLayout_64x128; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_192,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<192, 32>; + using CLayout = GMMA::CLayout_64x192; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MMA_Traits> +{ + using ElementDVal = int32_t; + using ElementAVal = uint8_t; + using ElementBVal = uint8_t; + using ElementCVal = int32_t; + + using ElementBFrg = GMMA::smem_desc; + + using Shape_MNK = Shape<_64,_256,_32>; + using ThrID = Layout<_128>; + using ALayout = GMMA::ALayout_64x32; + using BLayout = GMMA::ABLayout<256, 32>; + using CLayout = GMMA::CLayout_64x256; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // end namespace cute diff --git a/include/cute/config.hpp b/include/cute/config.hpp new file mode 100644 index 0000000000..b2f4de8363 --- /dev/null +++ b/include/cute/config.hpp @@ -0,0 +1,121 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#if defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA) +# define CUTE_HOST_DEVICE __forceinline__ __host__ __device__ +# define CUTE_DEVICE __forceinline__ __device__ +# define CUTE_HOST __forceinline__ __host__ +#else +# define CUTE_HOST_DEVICE inline +# define CUTE_DEVICE inline +# define CUTE_HOST inline +#endif // CUTE_HOST_DEVICE, CUTE_DEVICE + +#if defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA) +# define CUTE_UNROLL #pragma unroll +# define CUTE_NO_UNROLL #pragma unroll 1 +#else +# define CUTE_UNROLL +# define CUTE_NO_UNROLL +#endif // CUTE_UNROLL + +#if defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA) +# define CUTE_INLINE_CONSTANT static const __device__ +#else +# define CUTE_INLINE_CONSTANT static constexpr +#endif + +// Some versions of GCC < 11 have trouble deducing that a +// function with "auto" return type and all of its returns in an "if +// constexpr ... else" statement must actually return. Thus, GCC +// emits spurious "missing return statement" build warnings. +// Developers can suppress these warnings by using the +// CUTE_GCC_UNREACHABLE macro, which must be followed by a semicolon. +// It's harmless to use the macro for other GCC versions or other +// compilers, but it has no effect. +#if ! defined(CUTE_GCC_UNREACHABLE) +# if defined(__GNUC__) && __GNUC__ < 11 + // GCC 10, but not 7.5, 9.4.0, or 11, issues "missing return + // statement" warnings without this little bit of help. +# define CUTE_GCC_UNREACHABLE __builtin_unreachable() +# else +# define CUTE_GCC_UNREACHABLE +# endif +#endif + +// +// Assertion helpers +// + +#include + +#define CUTE_STATIC_ASSERT static_assert +#define CUTE_STATIC_ASSERT_V(x,...) 
static_assert(decltype(x)::value, ##__VA_ARGS__) + +#if defined(__CUDA_ARCH__) +# define CUTE_RUNTIME_ASSERT(x) asm volatile ("brkpt;\n" ::: "memory") +#else +# define CUTE_RUNTIME_ASSERT(x) assert(0 && x) +#endif + +// +// IO +// + +#include +#include +#include + +// +// Support +// + +#include + +// +// Basic types +// + +#include +#include +#include +#include +#include +#include +#include + +// +// Debugging utilities +// + +#include +#include diff --git a/include/cute/container/alignment.hpp b/include/cute/container/alignment.hpp new file mode 100644 index 0000000000..49101fa7a9 --- /dev/null +++ b/include/cute/container/alignment.hpp @@ -0,0 +1,70 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
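A minimal sketch of the situation the CUTE_GCC_UNREACHABLE comment above describes: an auto-returning function whose every path returns from an if-constexpr chain. The storage_unit function is invented for illustration.

#include <cstdint>

template <int Bits>
CUTE_HOST_DEVICE constexpr
auto storage_unit()
{
  if constexpr (Bits % 32 == 0) {
    return std::uint32_t(0);   // word-granular storage
  } else if constexpr (Bits % 16 == 0) {
    return std::uint16_t(0);   // half-word storage
  } else {
    return std::uint8_t(0);    // byte-granular storage
  }
  CUTE_GCC_UNREACHABLE;        // placates GCC 10's spurious "missing return statement"
}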
+ * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include + +namespace cute +{ + +// Test if a pointer is aligned to N bytes +template +CUTE_HOST_DEVICE constexpr +bool +is_byte_aligned(void const* const ptr) +{ + static_assert(N > 0 && (N & (N - 1)) == 0, "N must be a power of 2 in alignment check"); + return (reinterpret_cast(ptr) & (N-1)) == 0; +} + +#if defined(__CUDACC__) +# define CUTE_ALIGNAS(n) __align__(n) +#else +# define CUTE_ALIGNAS(n) alignas(n) +#endif + +template +struct aligned_struct {}; + +template <> struct CUTE_ALIGNAS( 1) aligned_struct< 1> {}; +template <> struct CUTE_ALIGNAS( 2) aligned_struct< 2> {}; +template <> struct CUTE_ALIGNAS( 4) aligned_struct< 4> {}; +template <> struct CUTE_ALIGNAS( 8) aligned_struct< 8> {}; +template <> struct CUTE_ALIGNAS( 16) aligned_struct< 16> {}; +template <> struct CUTE_ALIGNAS( 32) aligned_struct< 32> {}; +template <> struct CUTE_ALIGNAS( 64) aligned_struct< 64> {}; +template <> struct CUTE_ALIGNAS(128) aligned_struct<128> {}; +template <> struct CUTE_ALIGNAS(256) aligned_struct<256> {}; + +} // end namespace cute diff --git a/include/cute/container/array.hpp b/include/cute/container/array.hpp new file mode 100644 index 0000000000..571ac0897c --- /dev/null +++ b/include/cute/container/array.hpp @@ -0,0 +1,282 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
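A short usage sketch for the alignment helpers above; the Float4 struct and the 16-byte requirement are illustrative assumptions:

#include <cute/container/alignment.hpp>

// A 16-byte aligned aggregate, using the portability macro defined above
struct CUTE_ALIGNAS(16) Float4 { float v[4]; };

CUTE_HOST_DEVICE
bool can_do_128bit_access(void const* p)
{
  // 128-bit vectorized loads/stores need a 16-byte aligned address
  return cute::is_byte_aligned<16>(p);
}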
+ * + **************************************************************************************************/ +#pragma once + +#include +#include + +#include + +namespace cute +{ + +template +struct array +{ + using value_type = T; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + using reference = value_type&; + using const_reference = const value_type&; + using pointer = value_type*; + using const_pointer = const value_type*; + using iterator = pointer; + using const_iterator = const_pointer; + + CUTE_HOST_DEVICE constexpr + reference operator[](size_type pos) + { + return begin()[pos]; + } + + CUTE_HOST_DEVICE constexpr + const_reference operator[](size_type pos) const + { + return begin()[pos]; + } + + CUTE_HOST_DEVICE constexpr + reference front() + { + return *begin(); + } + + CUTE_HOST_DEVICE constexpr + const_reference front() const + { + return *begin(); + } + + CUTE_HOST_DEVICE constexpr + reference back() + { + // return *rbegin(); + return operator[](N-1); + } + + CUTE_HOST_DEVICE constexpr + const_reference back() const + { + // return *rbegin(); + return operator[](N-1); + } + + CUTE_HOST_DEVICE constexpr + T* data() + { + return __elems_; + } + + CUTE_HOST_DEVICE constexpr + T const* data() const + { + return __elems_; + } + + CUTE_HOST_DEVICE constexpr + iterator begin() + { + return data(); + } + + CUTE_HOST_DEVICE constexpr + const_iterator begin() const + { + return data(); + } + + CUTE_HOST_DEVICE constexpr + const_iterator cbegin() + { + return begin(); + } + + CUTE_HOST_DEVICE constexpr + const_iterator cbegin() const + { + return begin(); + } + + CUTE_HOST_DEVICE constexpr + iterator end() + { + return data() + size(); + } + + CUTE_HOST_DEVICE constexpr + const_iterator end() const + { + return data() + size(); + } + + CUTE_HOST_DEVICE constexpr + const_iterator cend() + { + return end(); + } + + CUTE_HOST_DEVICE constexpr + const_iterator cend() const + { + return end(); + } + + CUTE_HOST_DEVICE constexpr + bool empty() const + { + return size() == 0; + } + + CUTE_HOST_DEVICE constexpr + size_type size() const + { + return N; + } + + CUTE_HOST_DEVICE constexpr + size_type max_size() const + { + return size(); + } + + CUTE_HOST_DEVICE constexpr + void fill(const T& value) + { + for (auto& e : *this) { + e = value; + } + } + + CUTE_HOST_DEVICE constexpr + void clear() + { + fill(T(0)); + } + + CUTE_HOST_DEVICE constexpr + void swap(array& other) + { + using std::swap; + for (size_type i = 0; i < size(); ++i) { + swap((*this)[i], other[i]); + } + } + + value_type __elems_[N > 0 ? 
N : 1]; +}; + + +template +CUTE_HOST_DEVICE constexpr +bool operator==(array const& lhs, array const& rhs) +{ + for (std::size_t i = 0; i < N; ++i) { + if (lhs[i] != rhs[i]) { + return false; + } + } + return true; +} + +template +CUTE_HOST_DEVICE constexpr +void clear(array& a) +{ + a.fill(T(0)); +} + +template +CUTE_HOST_DEVICE constexpr +void fill(array& a, T const& value) +{ + a.fill(value); +} + +template +CUTE_HOST_DEVICE constexpr +void swap(array& a, array& b) +{ + a.swap(b); +} + +} // end cute + + +// +// Specialize tuple-related functionality for cute::array +// + +#include + +namespace cute +{ + +template +CUTE_HOST_DEVICE constexpr +T& get(array& a) +{ + static_assert(I < N, "Index out of range"); + return a[I]; +} + +template +CUTE_HOST_DEVICE constexpr +T const& get(array const& a) +{ + static_assert(I < N, "Index out of range"); + return a[I]; +} + +template +CUTE_HOST_DEVICE constexpr +T&& get(array&& a) +{ + static_assert(I < N, "Index out of range"); + return std::move(a[I]); +} + +} // end namespace cute + +namespace std +{ + +template +struct tuple_size> + : std::integral_constant +{}; + +template +struct tuple_element> +{ + using type = T; +}; + +} // end std diff --git a/include/cute/container/array_aligned.hpp b/include/cute/container/array_aligned.hpp new file mode 100644 index 0000000000..b1b357278d --- /dev/null +++ b/include/cute/container/array_aligned.hpp @@ -0,0 +1,276 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include +#include + +namespace cute +{ + +template +struct array_aligned + : public aligned_struct +{ + /// Make sure the Alignment makes sense wrt the size of elements. 
+ static_assert(Alignment == 16 || Alignment >= sizeof(T), "Alignment is too small"); + /// Alignment must be a power of two + static_assert(has_single_bit(Alignment), "Alignment must be a power of two"); + + using value_type = T; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + using reference = value_type&; + using const_reference = const value_type&; + using pointer = value_type*; + using const_pointer = const value_type*; + using iterator = pointer; + using const_iterator = const_pointer; + + CUTE_HOST_DEVICE constexpr + reference operator[](size_type pos) + { + return begin()[pos]; + } + + CUTE_HOST_DEVICE constexpr + const_reference operator[](size_type pos) const + { + return begin()[pos]; + } + + CUTE_HOST_DEVICE constexpr + reference front() + { + return *begin(); + } + + CUTE_HOST_DEVICE constexpr + const_reference front() const + { + return *begin(); + } + + CUTE_HOST_DEVICE constexpr + reference back() + { + // return *rbegin(); + return operator[](N-1); + } + + CUTE_HOST_DEVICE constexpr + const_reference back() const + { + // return *rbegin(); + return operator[](N-1); + } + + CUTE_HOST_DEVICE constexpr + T* data() + { + return reinterpret_cast(storage); + } + + CUTE_HOST_DEVICE constexpr + T const* data() const + { + return reinterpret_cast(storage); + } + + CUTE_HOST_DEVICE constexpr + iterator begin() + { + return data(); + } + + CUTE_HOST_DEVICE constexpr + const_iterator begin() const + { + return data(); + } + + CUTE_HOST_DEVICE constexpr + const_iterator cbegin() + { + return begin(); + } + + CUTE_HOST_DEVICE constexpr + const_iterator cbegin() const + { + return begin(); + } + + CUTE_HOST_DEVICE constexpr + iterator end() + { + return data() + size(); + } + + CUTE_HOST_DEVICE constexpr + const_iterator end() const + { + return data() + size(); + } + + CUTE_HOST_DEVICE constexpr + const_iterator cend() + { + return end(); + } + + CUTE_HOST_DEVICE constexpr + const_iterator cend() const + { + return end(); + } + + CUTE_HOST_DEVICE constexpr + bool empty() const + { + return size() == 0; + } + + CUTE_HOST_DEVICE constexpr + size_type size() const + { + return N; + } + + CUTE_HOST_DEVICE constexpr + size_type max_size() const + { + return size(); + } + + CUTE_HOST_DEVICE constexpr + void fill(T const& value) + { + for (auto& e : *this) { + e = value; + } + } + + CUTE_HOST_DEVICE constexpr + void clear() + { + fill(T(0)); + } + + // Not private, we want trivial type + //private: + + /// Storage type to use for Elements + using StorageType = typename uint_byte(Alignment)>::type; + + /// Ensure that there's enough storage for all elements + static_assert(sizeof(StorageType) <= Alignment, "StorageType is too big for given alignment"); + + /// Number of elements in the storage + static constexpr std::size_t storageN = (sizeof(T)*N + sizeof(StorageType) - 1) / sizeof(StorageType); + + /// The storage. + StorageType storage[storageN > 0 ? 
storageN : 1]; +}; + +// +// Operators +// + +template +CUTE_HOST_DEVICE constexpr +void clear(array_aligned& a) +{ + a.clear(); +} + +template +CUTE_HOST_DEVICE constexpr +void fill(array_aligned& a, T const& value) +{ + a.fill(value); +} + +} // end namespace cute + +// +// Specialize tuple-related functionality for cute::array +// + +#include + +namespace cute +{ + +template +CUTE_HOST_DEVICE constexpr +T& get(array_aligned& a) +{ + static_assert(I < N, "Index out of range"); + return a[I]; +} + +template +CUTE_HOST_DEVICE constexpr +T const& get(array_aligned const& a) +{ + static_assert(I < N, "Index out of range"); + return a[I]; +} + +template +CUTE_HOST_DEVICE constexpr +T&& get(array_aligned&& a) +{ + static_assert(I < N, "Index out of range"); + return std::move(a[I]); +} + +} // end namespace cute + +namespace std +{ + +template +struct tuple_size> + : std::integral_constant +{}; + +template +struct tuple_element> +{ + using type = T; +}; + +} // end std diff --git a/include/cute/container/array_subbyte.hpp b/include/cute/container/array_subbyte.hpp new file mode 100644 index 0000000000..a217a671f7 --- /dev/null +++ b/include/cute/container/array_subbyte.hpp @@ -0,0 +1,613 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Statically sized array of elements that accommodates subbyte trivial types + in a packed storage. 
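As a worked example of the packing arithmetic defined below (element width and count chosen purely for illustration): for a 4-bit element type T (sizeof_bits<T>::value == 4) and N == 10,

  kSizeBits              = 4 * 10 = 40            (40 % 32 != 0 and 40 % 16 != 0, so Storage is uint8_t)
  kElementsPerStoredItem = 8 / 4  = 2             elements packed into each Storage byte
  kStorageElements       = (10 + 2 - 1) / 2 = 5   Storage bytes backing the array
  bit_mask_              = (Storage(1) << 4) - 1 = 0xF, covering a single element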
+*/ + +#pragma once + +#include + +#include // sizeof_bits + +namespace cute +{ + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Statically sized array for any data type +template +class array_subbyte +{ + public: + + /// Number of total bits in the array + static constexpr int kSizeBits = sizeof_bits::value * N; + + /// Storage type + using Storage = typename std::conditional< + (kSizeBits % 32) == 0, + uint32_t, + typename std::conditional< + (kSizeBits % 16) == 0, + uint16_t, + uint8_t + >::type + >::type; + + + /// Number of logical elements per stored object + static constexpr int kElementsPerStoredItem = sizeof_bits::value / sizeof_bits::value; + + /// Number of storage elements + static constexpr std::size_t kStorageElements = (N + kElementsPerStoredItem - 1) / kElementsPerStoredItem; + + /// Bitmask for covering one item + static constexpr Storage bit_mask_ = ((Storage(1) << sizeof_bits::value) - 1); + + // + // C++ standard members with reference and iterator types omitted + // + + using value_type = T; + using pointer = value_type*; + using const_pointer = value_type const*; + + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + + // + // References + // + + /// Reference object inserts or extracts sub-byte items + class reference { + /// Pointer to storage element + Storage* ptr_; + + /// Index into elements packed into Storage object + int idx_; + + public: + + /// Default ctor + CUTE_HOST_DEVICE constexpr + reference() : ptr_(nullptr), idx_(0) {} + + /// Ctor + CUTE_HOST_DEVICE constexpr + reference(Storage* ptr, int idx = 0) : ptr_(ptr), idx_(idx) {} + + /// Assignment + CUTE_HOST_DEVICE constexpr + reference& operator=(T x) { + Storage item = (reinterpret_cast(x) & bit_mask_); + Storage kUpdateMask = Storage(~(bit_mask_ << (idx_ * sizeof_bits::value))); + *ptr_ = Storage((*ptr_ & kUpdateMask) | (item << (idx_ * sizeof_bits::value))); + return *this; + } + + CUTE_HOST_DEVICE constexpr + T get() const { + Storage item = Storage((*ptr_ >> (idx_ * sizeof_bits::value)) & bit_mask_); + return reinterpret_cast(item); + } + + /// Extract to type T -- disable if T == bool + template ::value)> + CUTE_HOST_DEVICE constexpr + operator T() const { + return get(); + } + + // Extract to bool -- potentially faster impl + CUTE_HOST_DEVICE constexpr + operator bool() const { + return bool((*ptr_) & (bit_mask_ << (idx_ * sizeof_bits::value))); + } + + /// Explicit cast to int + CUTE_HOST_DEVICE constexpr + explicit operator int() const { + return int(get()); + } + + /// Explicit cast to float + CUTE_HOST_DEVICE constexpr + explicit operator float() const { + return float(get()); + } + }; + + /// Reference object extracts sub-byte items + class const_reference { + + /// Pointer to storage element + Storage const* ptr_; + + /// Index into elements packed into Storage object + int idx_; + + public: + + /// Default ctor + CUTE_HOST_DEVICE constexpr + const_reference(): ptr_(nullptr), idx_(0) { } + + /// Ctor + CUTE_HOST_DEVICE constexpr + const_reference(Storage const* ptr, int idx = 0): ptr_(ptr), idx_(idx) { } + + CUTE_HOST_DEVICE constexpr + const T get() const { + Storage item = Storage((*ptr_ >> (idx_ * sizeof_bits::value)) & bit_mask_); + return reinterpret_cast(item); + } + + /// Extract to type T -- disable if T == bool + template ::value)> + CUTE_HOST_DEVICE constexpr + operator T() const { + return get(); + } + + // Extract to bool -- potentially faster impl + CUTE_HOST_DEVICE constexpr + operator 
bool() const { + return bool((*ptr_) & (bit_mask_ << (idx_ * sizeof_bits<T>::value))); + } + + /// Explicit cast to int + CUTE_HOST_DEVICE constexpr + explicit operator int() const { + return int(get()); + } + + /// Explicit cast to float + CUTE_HOST_DEVICE constexpr + explicit operator float() const { + return float(get()); + } + }; + + // + // Iterators + // + + /// Bidirectional iterator over elements + class iterator { + + /// Pointer to storage element + Storage* ptr_; + + /// Index into elements packed into Storage object + int idx_; + + public: + + CUTE_HOST_DEVICE constexpr + iterator(): ptr_(nullptr), idx_(0) { } + + CUTE_HOST_DEVICE constexpr + iterator(Storage* ptr, int idx = 0): ptr_(ptr), idx_(idx) { } + + CUTE_HOST_DEVICE constexpr + iterator& operator++() { + ++idx_; + if (idx_ == kElementsPerStoredItem) { + ++ptr_; + idx_ = 0; + } + return *this; + } + + CUTE_HOST_DEVICE constexpr + iterator& operator--() { + if (idx_) { + --idx_; + } else { + --ptr_; + idx_ = kElementsPerStoredItem - 1; + } + return *this; + } + + CUTE_HOST_DEVICE constexpr + iterator operator++(int) { + iterator ret(*this); + ++(*this); + return ret; + } + + CUTE_HOST_DEVICE constexpr + iterator operator--(int) { + iterator ret(*this); + --(*this); + return ret; + } + + CUTE_HOST_DEVICE constexpr + iterator& operator+=(int k) { + idx_ += k; + ptr_ += idx_ / kElementsPerStoredItem; + idx_ = idx_ % kElementsPerStoredItem; + return *this; + } + + CUTE_HOST_DEVICE constexpr + iterator operator+(int k) const { + return iterator(ptr_,idx_) += k; + } + + CUTE_HOST_DEVICE constexpr + reference operator*() const { + return reference(ptr_, idx_); + } + + CUTE_HOST_DEVICE constexpr + reference operator[](int k) const { + return *(*this + k); + } + + CUTE_HOST_DEVICE constexpr + bool operator==(iterator const& other) const { + return ptr_ == other.ptr_ && idx_ == other.idx_; + } + + CUTE_HOST_DEVICE constexpr + bool operator!=(iterator const& other) const { + return !(*this == other); + } + }; + + /// Bidirectional constant iterator over elements + class const_iterator { + + /// Pointer to storage element + Storage const* ptr_; + + /// Index into elements packed into Storage object + int idx_; + + public: + + CUTE_HOST_DEVICE constexpr + const_iterator(): ptr_(nullptr), idx_(0) { } + + CUTE_HOST_DEVICE constexpr + const_iterator(Storage const* ptr, int idx = 0): ptr_(ptr), idx_(idx) { } + + CUTE_HOST_DEVICE constexpr + const_iterator& operator++() { + ++idx_; + if (idx_ == kElementsPerStoredItem) { + ++ptr_; + idx_ = 0; + } + return *this; + } + + CUTE_HOST_DEVICE constexpr + const_iterator& operator--() { + if (idx_) { + --idx_; + } else { + --ptr_; + idx_ = kElementsPerStoredItem - 1; + } + return *this; + } + + CUTE_HOST_DEVICE constexpr + const_iterator operator++(int) { + const_iterator ret(*this); + ++idx_; + if (idx_ == kElementsPerStoredItem) { + ++ptr_; + idx_ = 0; + } + return ret; + } + + CUTE_HOST_DEVICE constexpr + const_iterator operator--(int) { + const_iterator ret(*this); + if (idx_) { + --idx_; + } else { + --ptr_; + idx_ = kElementsPerStoredItem - 1; + } + return ret; + } + + CUTE_HOST_DEVICE constexpr + const_iterator& operator+=(int k) { + idx_ += k; + ptr_ += idx_ / kElementsPerStoredItem; + idx_ = idx_ % kElementsPerStoredItem; + return *this; + } + + CUTE_HOST_DEVICE constexpr + const_iterator operator+(int k) const { + return const_iterator(ptr_,idx_) += k; + } + + CUTE_HOST_DEVICE constexpr + const_reference operator*() const { + return const_reference(ptr_, idx_); + } + + CUTE_HOST_DEVICE constexpr + 
const_reference operator[](int k) const { + return *(*this + k); + } + + CUTE_HOST_DEVICE constexpr + bool operator==(const_iterator const& other) const { + return ptr_ == other.ptr_ && idx_ == other.idx_; + } + + CUTE_HOST_DEVICE constexpr + bool operator!=(const_iterator const& other) const { + return !(*this == other); + } + }; + +private: + + /// Internal storage + Storage storage[kStorageElements]; + +public: + + CUTE_HOST_DEVICE constexpr + array_subbyte() { } + + CUTE_HOST_DEVICE constexpr + array_subbyte(array_subbyte const& x) { + CUTE_UNROLL + for (unsigned i = 0; i < kStorageElements; ++i) { + storage[i] = x.storage[i]; + } + } + + CUTE_HOST_DEVICE constexpr + size_type size() const { + return N; + } + + CUTE_HOST_DEVICE constexpr + size_type max_size() const { + return N; + } + + CUTE_HOST_DEVICE constexpr + bool empty() const { + return !N; + } + + /// Efficient clear method + CUTE_HOST_DEVICE constexpr + void clear() { + CUTE_UNROLL + for (unsigned i = 0; i < kStorageElements; ++i) { + storage[i] = Storage(0); + } + } + + // Efficient fill method + CUTE_HOST_DEVICE constexpr + void fill(T const& value) { + Storage item = (reinterpret_cast<Storage const&>(value) & bit_mask_); + + // Reproduce the value over the bits of the storage item + CUTE_UNROLL + for (unsigned s = sizeof_bits<T>::value; s < sizeof_bits<Storage>::value; s *= 2) { + item |= item << s; + } + + CUTE_UNROLL + for (unsigned i = 0; i < kStorageElements; ++i) { + storage[i] = item; + } + } + + CUTE_HOST_DEVICE constexpr + reference at(size_type pos) { + return reference(storage + pos / kElementsPerStoredItem, pos % kElementsPerStoredItem); + } + + CUTE_HOST_DEVICE constexpr + const_reference at(size_type pos) const { + return const_reference(storage + pos / kElementsPerStoredItem, pos % kElementsPerStoredItem); + } + + CUTE_HOST_DEVICE constexpr + reference operator[](size_type pos) { + return at(pos); + } + + CUTE_HOST_DEVICE constexpr + const_reference operator[](size_type pos) const { + return at(pos); + } + + CUTE_HOST_DEVICE constexpr + reference front() { + return at(0); + } + + CUTE_HOST_DEVICE constexpr + const_reference front() const { + return at(0); + } + + CUTE_HOST_DEVICE constexpr + reference back() { + return reference(storage + kStorageElements - 1, kElementsPerStoredItem - 1); + } + + CUTE_HOST_DEVICE constexpr + const_reference back() const { + return const_reference(storage + kStorageElements - 1, kElementsPerStoredItem - 1); + } + + CUTE_HOST_DEVICE constexpr + pointer data() { + return reinterpret_cast<pointer>(storage); + } + + CUTE_HOST_DEVICE constexpr + const_pointer data() const { + return reinterpret_cast<const_pointer>(storage); + } + + CUTE_HOST_DEVICE constexpr + Storage* raw_data() { + return storage; + } + + CUTE_HOST_DEVICE constexpr + Storage const* raw_data() const { + return storage; + } + + CUTE_HOST_DEVICE constexpr + iterator begin() { + return iterator(storage); + } + + CUTE_HOST_DEVICE constexpr + const_iterator begin() const { + return const_iterator(storage); + } + + CUTE_HOST_DEVICE constexpr + const_iterator cbegin() const { + return begin(); + } + + CUTE_HOST_DEVICE constexpr + iterator end() { + return iterator(storage + N / kElementsPerStoredItem, N % kElementsPerStoredItem); + } + + CUTE_HOST_DEVICE constexpr + const_iterator end() const { + return const_iterator(storage + N / kElementsPerStoredItem, N % kElementsPerStoredItem); + } + + CUTE_HOST_DEVICE constexpr + const_iterator cend() const { + return end(); + } + + // + // Comparison operators + // + +}; + +// +// Operators +// + +template +CUTE_HOST_DEVICE constexpr +void 
clear(array_subbyte& a) +{ + a.clear(); +} + +template +CUTE_HOST_DEVICE constexpr +void fill(array_subbyte& a, T const& value) +{ + a.fill(value); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cute + +// +// Specialize tuple-related functionality for cute::array_subbyte +// + +#include + +namespace cute +{ + +template +CUTE_HOST_DEVICE constexpr +T& get(array_subbyte& a) +{ + static_assert(I < N, "Index out of range"); + return a[I]; +} + +template +CUTE_HOST_DEVICE constexpr +T const& get(array_subbyte const& a) +{ + static_assert(I < N, "Index out of range"); + return a[I]; +} + +template +CUTE_HOST_DEVICE constexpr +T&& get(array_subbyte&& a) +{ + static_assert(I < N, "Index out of range"); + return std::move(a[I]); +} + +} // end namespace cute + +namespace std +{ + +template +struct tuple_size> + : std::integral_constant +{}; + +template +struct tuple_element> +{ + using type = T; +}; + +} // end namespace std diff --git a/include/cute/container/array_view.hpp b/include/cute/container/array_view.hpp new file mode 100644 index 0000000000..51b3ccc07d --- /dev/null +++ b/include/cute/container/array_view.hpp @@ -0,0 +1,274 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include +#include + +#include + +namespace cute +{ + +template +struct array_view +{ + using value_type = T; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + using reference = value_type&; + using const_reference = const value_type&; + using pointer = value_type*; + using const_pointer = const value_type*; + using iterator = pointer; + using const_iterator = const_pointer; + + array_view(array& a) + : __elems_(a.data()) {} + + CUTE_HOST_DEVICE + reference operator[](size_type pos) + { + return begin()[pos]; + } + + CUTE_HOST_DEVICE + const_reference operator[](size_type pos) const + { + return begin()[pos]; + } + + CUTE_HOST_DEVICE + reference front() + { + return *begin(); + } + + CUTE_HOST_DEVICE + const_reference front() const + { + return *begin(); + } + + CUTE_HOST_DEVICE + reference back() + { + // return *rbegin(); + return operator[](N-1); + } + + CUTE_HOST_DEVICE + const_reference back() const + { + // return *rbegin(); + return operator[](N-1); + } + + CUTE_HOST_DEVICE + T* data() + { + return __elems_; + } + + CUTE_HOST_DEVICE + const T* data() const + { + return __elems_; + } + + CUTE_HOST_DEVICE + iterator begin() + { + return data(); + } + + CUTE_HOST_DEVICE + const_iterator begin() const + { + return data(); + } + + CUTE_HOST_DEVICE + const_iterator cbegin() + { + return begin(); + } + + CUTE_HOST_DEVICE + const_iterator cbegin() const + { + return begin(); + } + + CUTE_HOST_DEVICE + iterator end() + { + return data() + size(); + } + + CUTE_HOST_DEVICE + const_iterator end() const + { + return data() + size(); + } + + CUTE_HOST_DEVICE + const_iterator cend() + { + return end(); + } + + CUTE_HOST_DEVICE + const_iterator cend() const + { + return end(); + } + + CUTE_HOST_DEVICE constexpr + bool empty() const + { + return size() == 0; + } + + CUTE_HOST_DEVICE constexpr + size_type size() const + { + return N; + } + + CUTE_HOST_DEVICE constexpr + size_type max_size() const + { + return size(); + } + + CUTE_HOST_DEVICE + void fill(const T& value) + { + for(auto& e : *this) + { + e = value; + } + } + + CUTE_HOST_DEVICE + void swap(array_view& other) + { + using std::swap; + swap(__elems_, other.__elems_); + } + + value_type* __elems_; +}; + + +template +CUTE_HOST_DEVICE +bool operator==(const array_view& lhs, const array_view& rhs) +{ + for(std::size_t i = 0; i < N; ++i) + { + if(lhs[i] != rhs[i]) return false; + } + + return true; +} + +template +CUTE_HOST_DEVICE +void clear(array_view& a) +{ + a.fill(T(0)); +} + +template +CUTE_HOST_DEVICE +void swap(array_view& a, array_view& b) +{ + a.swap(b); +} + +} // end cute + + +// +// Specialize tuple-related functionality for cute::array_view +// + +#include + +namespace cute +{ + +template +CUTE_HOST_DEVICE constexpr +T& +get(array_view& a) +{ + static_assert(I < N, "Index out of range"); + return a[I]; +} + +template +CUTE_HOST_DEVICE constexpr +const T& +get(const array_view& a) +{ + static_assert(I < N, "Index out of range"); + return a[I]; +} + +template +CUTE_HOST_DEVICE constexpr +T&& +get(array_view&& a) +{ + static_assert(I < N, "Index out of range"); + return std::move(a[I]); +} + +} // end namespace cute + +namespace std +{ + +template +struct tuple_size> + : std::integral_constant +{}; + +template +struct tuple_element> +{ + using type = T; +}; + +} // end std diff --git a/include/cute/container/bit_field.hpp b/include/cute/container/bit_field.hpp new file mode 100644 index 
0000000000..06b08754c9 --- /dev/null +++ b/include/cute/container/bit_field.hpp @@ -0,0 +1,131 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Portable bit field that supports byte and word straddling that can + be used in unions to bit-wise define parameters. +*/ + +#pragma once + +#include + +#include // uint_bit_t + +namespace cute +{ + +class dummy_type {}; + +template +struct bit_field +{ + static_assert(0 < NumBits && NumBits <= 64, "bit_fields with more than 64 bits are not supported."); + + // value_type: Use the smallest value type that fits NumBits + static constexpr uint32_t value_type_bits = (NumBits <= 8) ? 8 : + (NumBits <= 16) ? 16 : + (NumBits <= 32) ? 32 : 64; + using value_type = cute::uint_bit_t; + // storage_type: Use the smallest storage_type that avoids boundary crossing + static constexpr uint32_t storage_type_bits = (BitStart / 8 == (BitStart + NumBits - 1) / 8) ? 8 : + (BitStart / 16 == (BitStart + NumBits - 1) / 16) ? 16 : + (BitStart / 32 == (BitStart + NumBits - 1) / 32) ? 
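/** Worked example of the storage_type selection above (illustrative; assumes
 * the template parameter order bit_field<BitStart, NumBits>): a field with
 * BitStart=6 and NumBits=4 occupies bits [6,10), which crosses a byte
 * boundary but not a 16-bit boundary, so storage_type_bits == 16.
 * \code
 * cute::bit_field<6, 4> f{};  // value_type holds 8 bits, storage_type holds 16
 * f = 0xA;                    // set() writes bits [6,10) of the storage word
 * // uint8_t(f) == 0xA        // get() extracts and right-aligns the field
 * \endcode
 */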
32 : 64; + using storage_type = cute::uint_bit_t; + + static_assert(sizeof(OtherValueType) == sizeof(value_type) || std::is_same::value, + "sizeof(OtherValueType) must be same as sizeof(value_type)."); + + // Number of storage values needed: ceil_div(BitStart + NumBits, storage_type_bits) + static constexpr uint32_t N = (BitStart + NumBits + storage_type_bits - 1) / storage_type_bits; + // Index of storage value for BitStart + static constexpr uint32_t idx = BitStart / storage_type_bits; + // Bit of data_[idx] for BitStart + static constexpr uint32_t bit_lo = BitStart % storage_type_bits; + // Number of bits in data_[idx] used for NumBits if straddling, else 0 + static constexpr uint32_t bit_hi = (idx + 1 < N) ? (storage_type_bits - bit_lo) : 0; + + // NumBits mask + static constexpr value_type mask = (NumBits < 64) ? ((uint64_t(1) << NumBits) - 1) : uint64_t(-1); + // NumBits mask for BitStart + static constexpr storage_type mask_lo = storage_type(mask) << bit_lo; + // NumBits mask for leftover bits in data_[idx+1] if straddling, else 0 + static constexpr storage_type mask_hi = (idx + 1 < N) ? (storage_type(mask) >> bit_hi) : 0; + + storage_type data_[N]; + + // Get value + CUTE_HOST_DEVICE constexpr + value_type get() const { + storage_type result = (data_[idx] & mask_lo) >> bit_lo; + if constexpr (bit_hi) { + result |= (data_[idx+1] & mask_hi) << bit_hi; + } + return static_cast(result); + } + + // Set value + CUTE_HOST_DEVICE constexpr + void set(value_type x) { + storage_type item = static_cast(x & mask); + data_[idx] = static_cast((data_[idx] & ~mask_lo) | (item << bit_lo)); + if constexpr (bit_hi) { + data_[idx+1] = static_cast((data_[idx+1] & ~mask_hi) | (item >> bit_hi)); + } + } + + // Assign value + CUTE_HOST_DEVICE constexpr + bit_field& operator=(value_type x) { + set(x); + return *this; + } + + // Cast to value + CUTE_HOST_DEVICE constexpr + operator value_type () const { + return get(); + } + + // Assign OtherValueType + CUTE_HOST_DEVICE constexpr + bit_field& operator=(OtherValueType x) { + return *this = *reinterpret_cast(&x); + } + + // Cast to OtherValueType + CUTE_HOST_DEVICE constexpr + operator OtherValueType () const { + value_type x = get(); + return *reinterpret_cast(&x); + } +}; + +} // end namespace cute diff --git a/include/cute/container/tuple.hpp b/include/cute/container/tuple.hpp new file mode 100644 index 0000000000..1b3ffa42d4 --- /dev/null +++ b/include/cute/container/tuple.hpp @@ -0,0 +1,671 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include +#include + +#include +#include + +#include // cute::true_type, cute::false_type +//#include // Advanced optimizations + +#if 0 +// +// Use of agency::tuple is functional, but is over-engineered for our purposes... +// This tends to result in slow compilation times and unintentionally propagated cvref types +// + +#include + +namespace cute +{ + +using agency::tuple; + +using agency::make_tuple; +using agency::tuple_cat; + +} // end namespace cute +#endif + +// cute::tuple is like std::tuple, with two differences. +// +// 1. It works on both host and device. +// 2. Its template arguments must be semiregular types. +// +// Semiregular types are default constructible and copyable. +// They include "value types" like int or float, +// but do _not_ include references like int& or float&. +// (See std::tie for an example of a tuple of references.) +// +// This is simplified over the implementation in std:: and agency:: by ignoring much of +// the conversion SFINAE, special overloading, and avoiding cvref template types. +// Furthermore, the empty base optimization (EBO) is MORE aggressive by avoiding +// construction calls, and ignoring any need for unique element addresses. +// +// Over the agency::tuple implementation, this appears to accelerate compilation times by over 3x. + +namespace cute +{ + +namespace detail +{ + +// EBO stands for "empty base optimization." +// We use this technique to ensure that cute::tuple +// doesn't need to waste space storing any template arguments +// of cute::tuple that have no data (like integral_constant). +// Otherwise, cute::tuple would need to spend at least 1 byte +// for each of its template arguments. +// +// EBO always "holds" a single value of type T. +// N is like an array index that TupleBase uses +// to access the desired tuple element. +template ::value> +struct EBO; + +// Specialization for types T that have no data; +// the "static tuple leaf." Valid T here include +// integral_constant, Int, +// and any other semiregular type +// for which std::is_empty_v is true. +template +struct EBO +{ + CUTE_HOST_DEVICE constexpr + EBO() {} + + CUTE_HOST_DEVICE constexpr + EBO(T const&) {} +}; + +template +CUTE_HOST_DEVICE constexpr T getv(EBO const&) +{ return {}; } + +// Specialization for types T that are not empty; +// the "dynamic tuple leaf." Valid T here include int, +// any other integral or floating-point type, +// or any semiregular type for which std::is_empty_v is false. 
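/** A consequence of the EBO scheme described above (illustrative): all-static
 * leaves contribute no storage, so a tuple's size is the size of its dynamic
 * leaves alone.
 * \code
 * static_assert(sizeof(cute::tuple<cute::Int<8>, int>) == sizeof(int),
 *               "the empty Int<8> leaf is elided; only the dynamic int is stored");
 * \endcode
 */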
+template +struct EBO +{ + CUTE_HOST_DEVICE constexpr + EBO() : t_{} {} + + template + CUTE_HOST_DEVICE constexpr + EBO(U const& u) : t_{u} {} + + T t_; +}; + +template +CUTE_HOST_DEVICE constexpr T const& getv(EBO const& x) +{ return x.t_; } + +template +CUTE_HOST_DEVICE constexpr T& getv(EBO& x) +{ return x.t_; } + +template +CUTE_HOST_DEVICE constexpr T&& getv(EBO&& x) +{ return static_cast(x.t_); } + +template +struct TupleBase; + +// Base class of cute::tuple. +// It inherits from EBO for each (i, t) in (I..., T...). +// The actual storage (for nonempty t) lives in the base classes. +// index_sequence is a way to wrap up a sequence of zero or more +// compile-time integer values in a single type. +// We only ever use index_sequence<0, 1, ..., sizeof...(T)> in practice, +// as the type alias TupleBase below indicates. +template +struct TupleBase, T...> + : EBO... +{ + CUTE_HOST_DEVICE constexpr + TupleBase() {} + + template + CUTE_HOST_DEVICE constexpr explicit + TupleBase(U const&... u) + : EBO(u)... {} + + template + CUTE_HOST_DEVICE constexpr + TupleBase(TupleBase, U...> const& u) + : EBO(getv(static_cast const&>(u)))... {} +}; + +} // end namespace detail + +// make_index_sequence returns index_sequence<0, 1, ..., K-1>. +template +using TupleBase = detail::TupleBase, T...>; + +// This is the actual cute::tuple class. +// The storage (if any) lives in TupleBase's EBO base classes. +template +struct tuple : TupleBase +{ + CUTE_HOST_DEVICE constexpr + tuple() {} + + template + CUTE_HOST_DEVICE constexpr + tuple(U const&... u) : TupleBase(u...) {} + + template + CUTE_HOST_DEVICE constexpr + tuple(tuple const& u) + : TupleBase(static_cast const&>(u)) {} +}; + +// +// get for cute::tuple (just like std::get for std::tuple) +// + +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +get(tuple const& t) noexcept +{ + static_assert(I < sizeof...(T), "Index out of range"); + return detail::getv(t); +} + +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +get(tuple& t) noexcept +{ + static_assert(I < sizeof...(T), "Index out of range"); + return detail::getv(t); +} + +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +get(tuple&& t) noexcept +{ + static_assert(I < sizeof...(T), "Index out of range"); + return detail::getv(static_cast&&>(t)); +} + +// +// Custom is_tuple trait simply checks the existence of std::tuple_size +// and assumes std::get(.), std::tuple_element +// +namespace detail { + +template +std::integral_constant::value >= 0> has_tuple_size(int); + +template +std::false_type has_tuple_size(...); + +} // end namespace detail + +template +struct is_tuple : decltype(detail::has_tuple_size(0)) {}; + +// +// make_tuple (value-based implementation) +// + +template +CUTE_HOST_DEVICE constexpr +tuple +make_tuple(T const&... t) +{ + return {t...}; +} + +// +// tuple_cat concatenates multiple cute::tuple into a single cute::tuple, +// just like std::tuple_cat for std::tuple. 
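/** Usage sketch for tuple_cat (illustrative):
 * \code
 * auto a = cute::make_tuple(cute::Int<2>{}, 3);
 * auto b = cute::make_tuple(4.0f);
 * auto c = cute::tuple_cat(a, b);  // cute::tuple<Int<2>, int, float>
 * static_assert(std::tuple_size<decltype(c)>::value == 3, "concatenated rank");
 * \endcode
 */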
+// + +#if 0 +// Original implementation + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1, + std::index_sequence, std::index_sequence) +{ + return cute::make_tuple(get(t0)..., get(t1)...); +} + +} // end namespace detail + +CUTE_HOST_DEVICE constexpr +tuple<> +tuple_cat() +{ + return {}; +} + +template ::value)> +CUTE_HOST_DEVICE constexpr +Tuple const& +tuple_cat(Tuple const& t) +{ + return t; +} + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1) +{ + return detail::tuple_cat(t0, t1, + std::make_index_sequence::value>{}, + std::make_index_sequence::value>{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, Ts const&... ts) +{ + return cute::tuple_cat(cute::tuple_cat(t0,t1),t2,ts...); +} +#endif + +#if 1 +// Extended implementation + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1, + std::index_sequence, std::index_sequence) +{ + return cute::make_tuple(get(t0)..., get(t1)...); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, + std::index_sequence, std::index_sequence, std::index_sequence) +{ + return cute::make_tuple(get(t0)..., get(t1)..., get(t2)...); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, T3 const& t3, + std::index_sequence, std::index_sequence, std::index_sequence, std::index_sequence) +{ + return cute::make_tuple(get(t0)..., get(t1)..., get(t2)..., get(t3)...); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, T3 const& t3, T4 const& t4, + std::index_sequence, std::index_sequence, std::index_sequence, std::index_sequence, std::index_sequence) +{ + return cute::make_tuple(get(t0)..., get(t1)..., get(t2)..., get(t3)..., get(t4)...); +} + +} // end namespace detail + +CUTE_HOST_DEVICE constexpr +tuple<> +tuple_cat() +{ + return {}; +} + +template ::value)> +CUTE_HOST_DEVICE constexpr +Tuple const& +tuple_cat(Tuple const& t) +{ + return t; +} + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1) +{ + return detail::tuple_cat(t0, t1, + std::make_index_sequence::value>{}, + std::make_index_sequence::value>{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2) +{ + return detail::tuple_cat(t0, t1, t2, + std::make_index_sequence::value>{}, + std::make_index_sequence::value>{}, + std::make_index_sequence::value>{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, T3 const& t3) +{ + return detail::tuple_cat(t0, t1, t2, t3, + std::make_index_sequence::value>{}, + std::make_index_sequence::value>{}, + std::make_index_sequence::value>{}, + std::make_index_sequence::value>{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, T3 const& t3, T4 const& t4) +{ + return detail::tuple_cat(t0, t1, t2, t3, t4, + std::make_index_sequence::value>{}, + std::make_index_sequence::value>{}, + std::make_index_sequence::value>{}, + std::make_index_sequence::value>{}, + std::make_index_sequence::value>{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, T3 const& t3, T4 const& t4, T5 const& t5, Ts const&... 
ts) +{ + return cute::tuple_cat(cute::tuple_cat(t0,t1,t2,t3,t4), t5, ts...); +} +#endif + +#if 0 +// Outer-Inner indexing trick to concat all tuples at once + +namespace detail { + +template +struct tuple_cat_helper +{ + static constexpr cute::array ns = {Ns...}; + + static constexpr std::size_t total_size() { + std::size_t sum = 0; + for (std::size_t n : ns) sum += n; + return sum; + } + static constexpr std::size_t total_size_ = total_size(); + + static constexpr auto values() { + cute::array outer_inner = {}; + + std::size_t idx = 0; + for (std::size_t i = 0; i < ns.size(); ++i) { + for (std::size_t j = 0; j < ns[i]; ++j, ++idx) { + outer_inner[idx][0] = i; + outer_inner[idx][1] = j; + } + } + return outer_inner; + } + static constexpr auto outer_inner_ = values(); + + using total_sequence = std::make_index_sequence; +}; + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(Tuple const& t, std::index_sequence) +{ + return cute::make_tuple(get(get(t))...); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1, + std::index_sequence, std::index_sequence) +{ + return cute::make_tuple(get(t0)..., get(t1)...); +} + +} // end namespace detail + +CUTE_HOST_DEVICE constexpr +tuple<> +tuple_cat() +{ + return {}; +} + +template ::value)> +CUTE_HOST_DEVICE constexpr +Tuple const& +tuple_cat(Tuple const& t) +{ + return t; +} + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(T0 const& t0, T1 const& t1) +{ + return detail::tuple_cat(t0, t1, + std::make_index_sequence::value>{}, + std::make_index_sequence::value>{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tuple_cat(Tuples const&... ts) +{ + using Helper = detail::tuple_cat_helper::value...>; + return detail::tuple_cat(make_tuple(ts...), typename Helper::total_sequence{}); +} +#endif + +// +// Equality operators +// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +equal_impl(TupleA const& a, TupleB const& b) +{ + if constexpr (I == std::tuple_size::value) { + return cute::true_type{}; // Terminal: TupleA is exhausted + } else if constexpr (I == std::tuple_size::value) { + return cute::false_type{}; // Terminal: TupleA is not exhausted, TupleB is exhausted + } else { + return (get(a) == get(b)) && equal_impl(a,b); + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace detail + +template ::value && is_tuple::value)> +CUTE_HOST_DEVICE constexpr +auto +operator==(TupleT const& t, TupleU const& u) +{ + return detail::equal_impl<0>(t, u); +} + +template ::value ^ is_tuple::value)> +CUTE_HOST_DEVICE constexpr +auto +operator==(TupleT const& t, TupleU const& u) +{ + return cute::false_type{}; +} + +template ::value && is_tuple::value)> +CUTE_HOST_DEVICE constexpr +auto +operator!=(TupleT const& t, TupleU const& u) +{ + return !(t == u); +} + +template ::value ^ is_tuple::value)> +CUTE_HOST_DEVICE constexpr +auto +operator!=(TupleT const& t, TupleU const& u) +{ + return cute::true_type{}; +} + +// +// Comparison operators +// + +// +// There are many ways to compare tuple of elements and because CuTe is built +// on parameterizing layouts of coordinates, some comparisons are appropriate +// only in certain cases. +// -- lexicographical comparison [reverse, reflected, revref] +// -- colexicographical comparison [reverse, reflected, revref] +// -- element-wise comparison [any,all] +// This can be very confusing. To avoid errors in selecting the appropriate +// comparison, op<|op<=|op>|op>= are *not* implemented for cute::tuple. 
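/** Note on the result type of == (illustrative; assumes cute's integral
 * constants propagate through &&): the recursion in equal_impl yields
 * cute::true_type/false_type when every element compares statically, and a
 * runtime bool otherwise.
 * \code
 * auto s = cute::make_tuple(cute::Int<1>{}, cute::Int<2>{});
 * static_assert(decltype(s == s)::value, "all-static: compile-time true");
 * auto d = cute::make_tuple(1, 2);
 * bool eq = (d == d);  // dynamic elements: evaluated at runtime
 * \endcode
 */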
+// +// That said, see int_tuple for more explicitly named common comparison ops. +// + +// +// Shortcuts +// + +//using std::get; +using std::tuple_size; +using std::tuple_element; +using std::tuple_element_t; + +// +// Display utilities +// + +namespace detail { + +template +CUTE_HOST_DEVICE void print_tuple(Tuple const& t, + std::index_sequence, char s = '(', char e = ')') +{ + using eat = int[]; + using cute::print; + (void) eat {(print(s), 0), + (print(Is == 0 ? "" : ","), print(get(t)), 0)..., + (print(e), 0)}; +} + +template +CUTE_HOST std::ostream& print_tuple_os(std::ostream& os, Tuple const& t, + std::index_sequence, char s = '(', char e = ')') +{ + using eat = int[]; + (void) eat {(void(os << s), 0), + (void(os << (Is == 0 ? "" : ",") << get(t)), 0)..., + (void(os << e), 0)}; + return os; +} + +} // end namespace detail + +template ::value)> +CUTE_HOST_DEVICE void print(Tuple const& t) +{ + return detail::print_tuple(t, std::make_index_sequence::value>{}); +} + +template ::value)> +CUTE_HOST std::ostream& operator<<(std::ostream& os, Tuple const& t) +{ + return detail::print_tuple_os(os, t, std::make_index_sequence::value>{}); +} + +} // end namespace cute + +// +// std:: compatability +// + +namespace std +{ + +template +struct tuple_size> + : std::integral_constant +{}; + +template +struct tuple_element> + : std::tuple_element> +{}; + +} // end std diff --git a/include/cute/container/type_list.hpp b/include/cute/container/type_list.hpp new file mode 100644 index 0000000000..c082a6daaf --- /dev/null +++ b/include/cute/container/type_list.hpp @@ -0,0 +1,84 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +namespace cute +{ + +template +struct type_c { + using type = T; +}; + +template +struct type_list {}; + +} // end namespace cute + +// +// Specialize tuple-related functionality for cute::type_list +// + +#include +#include + +namespace cute +{ + +template +CUTE_HOST_DEVICE constexpr +std::tuple_element_t> +get(type_list&) noexcept { + return {}; +} +template +CUTE_HOST_DEVICE constexpr +std::tuple_element_t> +get(type_list const& t) noexcept { + return {}; +} + +} // end namespace cute + +namespace std +{ + +template +struct tuple_size> + : std::integral_constant +{}; + +template +struct tuple_element> + : cute::type_c>::type> +{}; + +} // end namespace std diff --git a/include/cute/int_tuple.hpp b/include/cute/int_tuple.hpp new file mode 100644 index 0000000000..045e7210b1 --- /dev/null +++ b/include/cute/int_tuple.hpp @@ -0,0 +1,827 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include +#include +#include + +namespace cute +{ + +template +using IntTuple = cute::tuple; + +// Construct an IntTuple with all value-elements +template +CUTE_HOST_DEVICE constexpr +IntTuple +make_int_tuple(Ts const&... t) +{ + return {t...}; +} + +/** if rank(int) == 1, then get<0>(int) should work too + */ +template >::value)> +CUTE_HOST_DEVICE constexpr +decltype(auto) +get(T&& t) noexcept +{ + static_assert(I == 0, "Index out of range"); + return static_cast(t); +} + +/** Custom recursive get for anything that implements get(.) 
+ */ +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +get(Tuple&& t) noexcept +{ + return get(get(static_cast(t))); +} + +// +// rank +// + +template +CUTE_HOST_DEVICE constexpr +auto +rank(IntTuple const& t) +{ + if constexpr (sizeof...(Is) == 0) { + if constexpr (is_tuple::value) { + return Int::value>{}; + } else { + return Int<1>{}; + } + } else { + return rank(get(t)); + } + + CUTE_GCC_UNREACHABLE; +} + +template +using rank_t = decltype(rank(std::declval())); + +template +static constexpr int rank_v = rank_t::value; + +// +// shape +// + +template +CUTE_HOST_DEVICE constexpr +auto +shape(IntTuple const& s) +{ + if constexpr (is_tuple::value) { + return transform(s, [](auto const& a) { return shape(a); }); + } else { + return s; + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +shape(IntTuple const& s) +{ + if constexpr (is_tuple::value) { + return shape(get(s)); + } else { + return get(shape(s)); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// max +// + +template +CUTE_HOST_DEVICE constexpr +auto +max(T0 const& t0, Ts const&... ts) +{ + if constexpr (is_tuple::value) { + return cute::max(cute::apply(t0, [](auto const&... a){ return cute::max(a...); }), ts...); + } else if constexpr (sizeof...(Ts) == 0) { + return t0; + } else { + return cute::max(t0, cute::max(ts...)); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// min +// + +template +CUTE_HOST_DEVICE constexpr +auto +min(T0 const& t0, Ts const&... ts) +{ + if constexpr (is_tuple::value) { + return cute::min(cute::apply(t0, [](auto const&... a){ return cute::min(a...); }), ts...); + } else if constexpr (sizeof...(Ts) == 0) { + return t0; + } else { + return cute::min(t0, cute::min(ts...)); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// depth +// + +template +CUTE_HOST_DEVICE constexpr +auto +depth(IntTuple const& t) +{ + if constexpr (sizeof...(Is) == 0) { + if constexpr (is_tuple::value) { + return Int<1>{} + cute::apply(t, [](auto const&... v){ return cute::max(depth(v)...); }); + } else { + return Int<0>{}; + } + } else { + return depth(get(t)); + } + + CUTE_GCC_UNREACHABLE; +} + +template +using depth_t = decltype(depth(std::declval())); + +template +static constexpr int depth_v = depth_t::value; + +// +// product +// + +template +CUTE_HOST_DEVICE constexpr +auto +product(IntTuple const& a) +{ + if constexpr (is_tuple::value) { + return cute::apply(a, [](auto const&... v){ return (Int<1>{} * ... * product(v)); }); + } else { + return a; + } + + CUTE_GCC_UNREACHABLE; +} + +// Product of a subrange +template +CUTE_HOST_DEVICE constexpr +auto +product(Tuple const& a) +{ + return detail::apply(a, [](auto const&... v){ return (Int<1>{} * ... * product(v)); }, make_range{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +product_each(Tuple const& t) +{ + return transform(t, [](auto const& x) { return product(x); }); +} + +// Return the product of elements in a mode +template +CUTE_HOST_DEVICE constexpr +auto +size(IntTuple const& a) +{ + if constexpr (sizeof...(Is) == 0) { + return product(a); + } else { + return product(get(a)); + } + + CUTE_GCC_UNREACHABLE; +} + +template +static constexpr int size_v = decltype(size(std::declval()))::value; + +// +// sum +// + +template +CUTE_HOST_DEVICE constexpr +auto +sum(IntTuple const& a) +{ + if constexpr (is_tuple::value) { + return cute::apply(a, [](auto const&... v){ return (Int<0>{} + ... 
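/** The hierarchical queries above compose over nested IntTuples
 * (illustrative values):
 * \code
 * auto t = cute::make_int_tuple(2, cute::make_int_tuple(3, 4), 5);
 * // rank(t)     == 3    -- number of top-level modes
 * // depth(t)    == 2    -- one level of nesting below the top
 * // size(t)     == 120  -- product over all leaves: 2*3*4*5
 * // get<1,0>(t) == 3    -- recursive get into the nested mode
 * \endcode
 */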
+ sum(v)); }); + } else { + return a; + } + + CUTE_GCC_UNREACHABLE; +} + +// +// inner_product +// + +template +CUTE_HOST_DEVICE constexpr +auto +inner_product(IntTupleA const& a, IntTupleB const& b) +{ + if constexpr (is_tuple::value && is_tuple::value) { + static_assert(tuple_size::value == tuple_size::value, "Mismatched ranks"); + return transform_apply(a, b, [](auto const& x, auto const& y) { return inner_product(x,y); }, + [](auto const&... v) { return (Int<0>{} + ... + v); }); + } else { + return a * b; + } + + CUTE_GCC_UNREACHABLE; +} + +// +// ceil_div +// + +template +CUTE_HOST_DEVICE constexpr +auto +ceil_div(IntTupleA const& a, IntTupleB const& b) +{ + if constexpr (is_tuple::value && is_tuple::value) { + static_assert(tuple_size::value >= tuple_size::value, "Mismatched ranks"); + constexpr int R = tuple_size::value; // Missing ranks in TupleB are implictly 1 + return transform(a, append(b,Int<1>{}), [](auto const& x, auto const& y) { return ceil_div(x,y); }); + } else { + return (a + b - Int<1>{}) / b; + } + + CUTE_GCC_UNREACHABLE; +} + +/** Division for Shapes + */ +template +CUTE_HOST_DEVICE constexpr +auto +shape_div(IntTupleA const& a, IntTupleB const& b) +{ + if constexpr (is_tuple::value) { + if constexpr (is_tuple::value) { // tuple tuple + static_assert(tuple_size::value == tuple_size::value, "Mismatched ranks"); + return transform(a, b, [](auto const& x, auto const& y) { return shape_div(x,y); }); + } else { // tuple int + auto const [result, rest] = fold(a, make_tuple(make_tuple(), b), + [] (auto const& init, auto const& ai) { + return make_tuple(append(get<0>(init), shape_div(ai, get<1>(init))), shape_div(get<1>(init), ai)); + }); + return result; + } + } else { + if constexpr (is_tuple::value) { // int tuple + return shape_div(a, product(b)); + } else { // int int + //assert(a % b == 0 || b % a == 0); + return a / b != 0 ? a / b : signum(a) * signum(b); // divide with rounding away from zero + } + } + + CUTE_GCC_UNREACHABLE; +} + +/** Division for Shapes that are static constants + * @pre t % u == 0 || u % t == 0 + * @result if t % u == 0, then t / u + * if u % t == 0, then signum(t) * signum(u) + */ +template +CUTE_HOST_DEVICE constexpr +constant +shape_div(constant const&, constant const&) +{ + static_assert(t % u == 0 || u % t == 0, "Static shape_div failure"); + return {}; +} + +/** Return a tuple the same profile as A scaled by corresponding elements in B + */ +template +CUTE_HOST_DEVICE constexpr +auto +elem_scale(A const& a, B const& b) +{ + if constexpr (is_tuple::value) { + return transform(a, b, [](auto const& x, auto const& y) { return elem_scale(x,y); }); + } else { + return a * product(b); + } + + CUTE_GCC_UNREACHABLE; +} + +/** Test if two IntTuple have the same profile (hierarchical rank division) + */ +template +CUTE_HOST_DEVICE constexpr +auto +congruent(IntTupleA const& a, IntTupleB const& b) +{ + return bool_constant::value>{}; +} + +template +using is_congruent = decltype(congruent(std::declval(), std::declval())); + +/** Test if Shape B is compatible with Shape A: + * Any coordinate into A can also be used as a coordinate into B + * A <= B is a partially ordered set of factored shapes + */ +template +CUTE_HOST_DEVICE constexpr +auto +compatible(IntTupleA const& a, IntTupleB const& b) +{ + if constexpr (is_tuple::value && is_tuple::value) { + if constexpr (tuple_size::value != tuple_size::value) { + return false_type{}; + } else { + return transform_apply(a, b, [](auto const& x, auto const& y) { return compatible(x,y); }, + [](auto const&... 
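/** congruent() demands an identical hierarchical profile, while compatible()
 * only requires that any coordinate for A is a valid coordinate for B
 * (illustrative):
 * \code
 * auto flat = cute::make_shape(cute::Int<6>{}, cute::Int<2>{});
 * auto nest = cute::make_shape(cute::make_shape(cute::Int<2>{}, cute::Int<3>{}), cute::Int<2>{});
 * // congruent(flat, nest)  => false_type  -- profiles differ
 * // compatible(flat, nest) => true_type   -- 6 == size((2,3)) and 2 == 2
 * // compatible(nest, flat) => false_type  -- a tuple coordinate cannot index an integer mode
 * \endcode
 */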
z) { return (true_type{} && ... && z); }); + } + } else if constexpr (is_integral::value) { + return a == size(b); + } else if constexpr (is_integral::value) { + return false_type{}; + } else { + return compatible(shape(a), shape(b)); + } + + CUTE_GCC_UNREACHABLE; +} + +template +using is_compatible = decltype(compatible(std::declval(), std::declval())); + +/** Replace the elements of Tuple B that are paired with an Int<0> with an Int<1> + */ +template +CUTE_HOST_DEVICE constexpr +auto +filter_zeros(IntTupleA const& a, IntTupleB const& b) +{ + if constexpr (is_tuple::value) { + return transform(a, b, [](auto const& x, auto const& y) { return filter_zeros(x,y); }); + } else if constexpr (is_constant<0, IntTupleA>::value) { + return Int<1>{}; + } else { + return b; + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +filter_zeros(Tuple const& t) +{ + return filter_zeros(t, t); +} + +// +// Converters and constructors with arrays and params +// + +/** Make an IntTuple of rank N from an Indexable array. + * Access elements up to a dynamic index n, then use init (requires compatible types) + * Consider cute::take if all indexing is known to be valid + * \code + * std::vector a = {6,3,4}; + * auto tup = make_int_tuple<5>(a, a.size(), 0) // (6,3,4,0,0) + * \endcode + */ +template +CUTE_HOST_DEVICE constexpr +auto +make_int_tuple(Indexable const& t, int n, T const& init) +{ + static_assert(N > 0); + if constexpr (N == 1) { + return 0 < n ? t[0] : init; + } else { + return transform(make_seq{}, [&](auto i) { return i < n ? t[i] : init; }); + } + + CUTE_GCC_UNREACHABLE; +} + +/** Fill the dynamic values of a Tuple with values from another Tuple + * \code + * auto params = make_int_tuple(6,3,4); + * cute::tuple, cute::tuple>, int, Int<2>> result; + * fill_int_tuple_from(result, params); // (_1,(6,3,_3),4,_2) + * \endcode + */ +template +CUTE_HOST_DEVICE constexpr +auto +fill_int_tuple_from(Tuple& result, TupleV const& vals) +{ + return fold(result, vals, [](auto const& init, auto&& r) { + if constexpr (is_static>::value) { // Skip static elements of result + return init; + } else if constexpr (is_tuple>::value) { // Recurse into tuples + return fill_int_tuple_from(r, init); + } else { // Assign and consume arg + static_assert(tuple_size>::value > 0, "Not enough values to fill with!"); + r = get<0>(init); + return remove<0>(init); + } + + CUTE_GCC_UNREACHABLE; + }); +} + +/** Make a "Tuple" by filling in the dynamic values in order from the arguments + * \code + * using result_t = cute::tuple, cute::tuple>, int, Int<2>>; + * auto result = make_int_tuple_from(6,3,4); // (_1,(6,3,_3),4,_2) + * \endcode + */ +template +CUTE_HOST_DEVICE constexpr +Tuple +make_int_tuple_from(Ts const&... 
ts) +{ + Tuple result = Tuple{}; + fill_int_tuple_from(result, make_tuple(ts...)); + return result; +} + +/** Convert a tuple to a flat homogeneous array of type T + * \code + * auto tup = make_tuple(Int<1>{}, make_tuple(6,3,Int<3>{}),4,Int<2>{}); + * cute::array result = to_array(tup); // [1,6,3,3,4,2] + * \endcode + */ +template +CUTE_HOST_DEVICE constexpr +auto +to_array(IntTuple const& t) +{ + auto flat_t = flatten_to_tuple(t); + constexpr int N = tuple_size::value; + cute::array result; + for_each(make_seq{}, [&] (auto i) { result[i] = get(flat_t); }); + return result; +} + +// +// Comparison operators +// + +// +// There are many ways to compare tuple of elements and because CuTe is built +// on parameterizing layouts of coordinates, some comparisons are appropriate +// only in certain cases. +// -- lexicographical comparison [reverse, reflected, revref] : Correct for coords in RowMajor Layout +// -- colexicographical comparison [reverse, reflected, revref] : Correct for coords in ColMajor Layout +// -- element-wise comparison [any,all] : +// This can be very confusing. To avoid errors in selecting the appropriate +// comparison, op<|op<=|op>|op>= are *not* implemented for cute::tuple. +// +// When actually desiring to order coordinates, the user should map them to +// their indices within the Layout they came from: +// e.g. layoutX(coordA) < layoutX(coordB) +// That said, we implement the three most common ways to compare tuples below. +// These are implemented with slighly more explicit names than op<. +// + +template +CUTE_HOST_DEVICE constexpr +auto +lex_less(IntTupleA const& a, IntTupleB const& b); + +template +CUTE_HOST_DEVICE constexpr +auto +colex_less(IntTupleA const& a, IntTupleB const& b); + +template +CUTE_HOST_DEVICE constexpr +auto +elem_less(IntTupleA const& a, IntTupleB const& b); + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +lex_less_impl(TupleA const& a, TupleB const& b) +{ + if constexpr (I == tuple_size::value) { + return cute::false_type{}; // Terminal: TupleB is exhausted + } else if constexpr (I == tuple_size::value) { + return cute::true_type{}; // Terminal: TupleA is exhausted, TupleB is not exhausted + } else { + return lex_less(get(a), get(b)) || (get(a) == get(b) && lex_less_impl(a,b)); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +colex_less_impl(TupleA const& a, TupleB const& b) +{ + if constexpr (I == tuple_size::value) { + return cute::false_type{}; // Terminal: TupleB is exhausted + } else if constexpr (I == tuple_size::value) { + return cute::true_type{}; // Terminal: TupleA is exhausted, TupleB is not exhausted + } else { + constexpr std::size_t A = tuple_size::value - 1 - I; + constexpr std::size_t B = tuple_size::value - 1 - I; + return colex_less(get(a), get(b)) || (get(a) == get(b) && colex_less_impl(a,b)); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +elem_less_impl(TupleA const& a, TupleB const& b) +{ + if constexpr (I == tuple_size::value) { + return cute::true_type{}; // Terminal: TupleA is exhausted + } else if constexpr (I == tuple_size::value) { + return cute::false_type{}; // Terminal: TupleA is not exhausted, TupleB is exhausted + } else { + return elem_less(get(a), get(b)) && elem_less_impl(a,b); + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace detail + +// Lexicographical comparison + +template +CUTE_HOST_DEVICE constexpr +auto +lex_less(IntTupleA const& a, IntTupleB const& b) +{ + if constexpr (is_tuple::value && is_tuple::value) { + 
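/** The three comparison families order the same coordinates differently
 * (illustrative):
 * \code
 * auto x = cute::make_tuple(1, 2);
 * auto y = cute::make_tuple(2, 1);
 * // lex_less(x, y)   => true   -- mode-0 is most significant
 * // colex_less(x, y) => false  -- the last mode is most significant
 * // elem_less(x, y)  => false  -- requires every element of x to compare less
 * \endcode
 */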
return detail::lex_less_impl<0>(a, b); + } else { + return a < b; + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +lex_leq(T const& t, U const& u) { + return !lex_less(u, t); +} + +template +CUTE_HOST_DEVICE constexpr +auto +lex_gtr(T const& t, U const& u) { + return lex_less(u, t); +} + +template +CUTE_HOST_DEVICE constexpr +auto +lex_geq(T const& t, U const& u) { + return !lex_less(t, u); +} + +// Colexicographical comparison + +template +CUTE_HOST_DEVICE constexpr +auto +colex_less(IntTupleA const& a, IntTupleB const& b) +{ + if constexpr (is_tuple::value && is_tuple::value) { + return detail::colex_less_impl<0>(a, b); + } else { + return a < b; + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +colex_leq(T const& t, U const& u) { + return !colex_less(u, t); +} + +template +CUTE_HOST_DEVICE constexpr +auto +colex_gtr(T const& t, U const& u) { + return colex_less(u, t); +} + +template +CUTE_HOST_DEVICE constexpr +auto +colex_geq(T const& t, U const& u) { + return !colex_less(t, u); +} + +// Elementwise [all] comparison + +template +CUTE_HOST_DEVICE constexpr +auto +elem_less(IntTupleA const& a, IntTupleB const& b) +{ + if constexpr (is_tuple::value && is_tuple::value) { + return detail::elem_less_impl<0>(a, b); + } else { + return a < b; + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +elem_leq(T const& t, U const& u) { + return !elem_less(u, t); +} + +template +CUTE_HOST_DEVICE constexpr +auto +elem_gtr(T const& t, U const& u) { + return elem_less(u, t); +} + +template +CUTE_HOST_DEVICE constexpr +auto +elem_geq(T const& t, U const& u) { + return !elem_less(t, u); +} + +/** Increment a (dynamic) coord lexicographically within a shape + * \code + * auto shape = make_shape(1,2,make_shape(2,3),3); + * + * int i = 0; + * for (auto coord = repeat_like(shape, 0); back(coord) != back(shape); increment(coord, shape)) { + * std::cout << i++ << ": " << coord << std::endl; + * } + * assert(i == size(shape)); + * \endcode + */ +template +CUTE_HOST_DEVICE constexpr +void +increment(Coord& coord, Shape const& shape); + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +void +increment(Coord& coord, Shape const& shape, seq) +{ + cute::increment(get(coord), get(shape)); + if constexpr (sizeof...(Is) != 0) { + if (back(get(coord)) == back(get(shape))) { + back(get(coord)) = 0; + increment(coord, shape, seq{}); + } + } +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +void +increment(Coord& coord, Shape const& shape) +{ + if constexpr (is_integral::value && is_integral::value) { + ++coord; + } else if constexpr (is_tuple::value && is_tuple::value) { + static_assert(tuple_size::value == tuple_size::value, "Mismatched ranks"); + detail::increment(coord, shape, tuple_seq{}); + } else { + static_assert(sizeof(Coord) == 0, "Invalid parameters"); + } +} + +struct ForwardCoordIteratorSentinal +{}; + +// A forward iterator for a coordinate that starts from zero and goes to shape +template +struct ForwardCoordIterator +{ + static_assert(is_congruent::value); + + CUTE_HOST_DEVICE constexpr + Coord const& operator*() const { return coord; } + + CUTE_HOST_DEVICE constexpr + ForwardCoordIterator& operator++() { increment(coord, shape); return *this; } + + // Sentinal for the end of the implied range + CUTE_HOST_DEVICE constexpr + bool operator< (ForwardCoordIteratorSentinal const&) const { return back(coord) < back(shape); } + CUTE_HOST_DEVICE constexpr + bool 
operator==(ForwardCoordIteratorSentinal const&) const { return back(coord) == back(shape); } + CUTE_HOST_DEVICE constexpr + bool operator!=(ForwardCoordIteratorSentinal const&) const { return back(coord) != back(shape); } + // NOTE: These are expensive, avoid use + CUTE_HOST_DEVICE constexpr + bool operator< (ForwardCoordIterator const& other) const { return colex_less(coord, other.coord); } + CUTE_HOST_DEVICE constexpr + bool operator==(ForwardCoordIterator const& other) const { return coord == other.coord; } + CUTE_HOST_DEVICE constexpr + bool operator!=(ForwardCoordIterator const& other) const { return coord != other.coord; } + + Coord coord; + Shape const& shape; +}; + +// A forward iterator for a coordinate that starts from zero +template +CUTE_HOST_DEVICE constexpr +auto +make_coord_iterator(Shape const& shape) +{ + auto coord = repeat_like(shape, int(0)); + return ForwardCoordIterator{coord,shape}; +} + +} // end namespace cute diff --git a/include/cute/layout.hpp b/include/cute/layout.hpp new file mode 100644 index 0000000000..fe937ee738 --- /dev/null +++ b/include/cute/layout.hpp @@ -0,0 +1,1638 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include +#include +#include + +namespace cute +{ + +// Aliases + +template +using Shape = IntTuple; + +template +using Stride = IntTuple; + +template +using Step = IntTuple; + +template +using Coord = IntTuple; + +template +CUTE_HOST_DEVICE constexpr +Shape +make_shape(Ts const&... t) { + return {t...}; +} +template +CUTE_HOST_DEVICE constexpr +Stride +make_stride(Ts const&... t) { + return {t...}; +} +template +CUTE_HOST_DEVICE constexpr +Step +make_step(Ts const&... 
t) { + return {t...}; +} +template +CUTE_HOST_DEVICE constexpr +Coord +make_coord(Ts const&... t) { + return {t...}; +} + + +template > +struct Layout + : private cute::tuple // EBO for static layouts +{ + // Avoid bad CTAD: + // Layout smem = GMMA::Layout_MN_SW128_Atom; + // Should fail because smem is a ComposedLayout (SwizzleLayout) and not a Layout + static_assert(is_integral::value || is_tuple::value); + + // Expensive in compilation time... + //static_assert(is_congruent::value, + // "Shape and Stride must have the same hierarchical structure"); + //static_assert(is_integral::value || is_tuple::value); + + // NOTE: This defaults static Shapes/Strides correctly, but not dynamic + CUTE_HOST_DEVICE constexpr + Layout(LogicalShape const& logical_shape = {}, + LogicalStride const& logical_stride = {}) + : cute::tuple(logical_shape, logical_stride) + {} + + // + // Accessors + // + + static constexpr int rank = rank_v ; + + CUTE_HOST_DEVICE constexpr + decltype(auto) + layout() { + return *this; + } + + CUTE_HOST_DEVICE constexpr + decltype(auto) + layout() const { + return *this; + } + + template + CUTE_HOST_DEVICE constexpr + decltype(auto) + shape() { + return get<0,I...>(static_cast&>(*this)); + } + + template + CUTE_HOST_DEVICE constexpr + decltype(auto) + shape() const { + return get<0,I...>(static_cast const&>(*this)); + } + + template + CUTE_HOST_DEVICE constexpr + decltype(auto) + stride() { + return get<1,I...>(static_cast&>(*this)); + } + + template + CUTE_HOST_DEVICE constexpr + decltype(auto) + stride() const { + return get<1,I...>(static_cast const&>(*this)); + } + + // + // Mappings + // + + // Map a logical coordinate to a linear index (Coord has no Underscore slice operators) + // OR + // Slice the layout and return the sublayout (Coord has an Underscore slice op) + template + CUTE_HOST_DEVICE constexpr + auto + operator()(Coord const& coord) const { + if constexpr (has_underscore::value) { + return slice(coord, *this); + } else { + return crd2idx(coord, shape(), stride()); + } + + CUTE_GCC_UNREACHABLE; + } + + // Convenience function for multi-dimensional coordinates + template + CUTE_HOST_DEVICE constexpr + auto + operator()(Coord0 const& c0, Coord1 const& c1, Coords const&... cs) const { + return operator()(make_coord(c0,c1,cs...)); + } + + // Map a linear index to a hier ND logical coordinate + // NOTE: Dangerous and error-prone + template + CUTE_HOST_DEVICE constexpr + auto + operator[](Int const& linear_idx) const { + static_assert(is_integral::value); + return get_hier_coord(linear_idx); + } + + // + // Compose + // + + template + CUTE_HOST_DEVICE constexpr + auto + compose(OtherLayout const& other) const { + return composition(*this, other); + } + + template + CUTE_HOST_DEVICE constexpr + auto + compose(Layouts const&... layouts) const { + return composition(*this, make_tile(layouts...)); + } + + template + CUTE_HOST_DEVICE constexpr + auto + with_shape(OtherShape const& shape) const { + return composition(*this, make_layout(shape)); + } + + template + CUTE_HOST_DEVICE constexpr + auto + with_shape(Shapes const&... shapes) const { + return composition(*this, make_layout(make_shape(shapes...))); + } + + // + // Tile + // + + template + CUTE_HOST_DEVICE constexpr + auto + tile(OtherLayout const& other) const { + return tiled_divide(*this, other); + } + + template + CUTE_HOST_DEVICE constexpr + auto + tile(Layouts const&... 
layouts) const { + return tiled_divide(*this, make_tile(layouts...)); + } + + // + // Utility + // + + // + // Index to Coordinate + // + + // NOTE: Only valid for compact layouts + + // Return the (hierarchical) ND logical coordinate corresponding to the linear index + // @post crd2idx(@a result, shape(), stride()) == idx + // @post congruent(@a result, shape()) + template ::value)> + CUTE_HOST_DEVICE constexpr + auto + get_hier_coord(IInt const& idx) const { + return cute::idx2crd(idx, shape(), stride()); + } + + // Return the (flat) ND logical coordinate corresponding to the linear index + // @post crd2idx(@a result, shape(), stride()) == idx + // @post rank(@a result) == rank(shape()) && depth(@a result) == 1 + template ::value)> + CUTE_HOST_DEVICE constexpr + auto + get_flat_coord(IInt const& idx) const { + return cute::crd2crd(this->get_hier_coord(idx), shape(), repeat(Int<1>{})); + } + + // Return the generalized column-major 1D logical coordinate corresponding to the linear index + // @post crd2idx(@a result, shape(), stride()) == idx + // @post is_integral::value + template ::value)> + CUTE_HOST_DEVICE constexpr + auto + get_1d_coord(IInt const& idx) const { + return cute::crd2idx(this->get_hier_coord(idx), shape()); + } + + // + // Coordinate to Coordinate + // + +#if 0 + // Return the (hierarchical) ND logical coordinate corresponding to the linear index + // @post congruent(@a result, shape()) + template + CUTE_HOST_DEVICE constexpr + auto + crd_2_hier_coord(Coord const& crd) const { + return cute::crd2crd(crd, shape(), shape()); + } + + // Return the (flat) ND logical coordinate corresponding to the linear index + // @post rank(@a result) == rank(shape()) && depth(@a result) == 1 + template + CUTE_HOST_DEVICE constexpr + auto + crd_2_flat_coord(Coord const& crd) const { + return cute::crd2crd(crd, shape(), product_each(shape())); + } + + // Return the generalized column-major 1D logical coordinate corresponding to the linear index + // @post is_integral::value + template + CUTE_HOST_DEVICE constexpr + auto + crd_2_1d_coord(Coord const& crd) const { + //return cute::crd2crd(crd, shape(), product(shape())); + return cute::crd2idx(crd, shape()); + } +#endif +}; + + +template +struct is_layout : false_type {}; +template +struct is_layout> : true_type {}; + + +template ::value || is_integral::value) && + (is_tuple::value || is_integral::value))> +CUTE_HOST_DEVICE constexpr +auto +make_layout(Shape const& shape, Stride const& stride) +{ + return Layout(shape, stride); +} + +template ::value || is_integral::value)> +CUTE_HOST_DEVICE constexpr +auto +make_layout(Shape const& shape) +{ + return make_layout(shape, compact_col_major(shape)); +} + +// Construct a layout from multiple layouts by +// concatenating each layout as an independent mode +template +CUTE_HOST_DEVICE constexpr +auto +make_layout(Layout const&... 
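/** A Layout is a map from (hierarchical) coordinates to linear indices
 * (illustrative; a 4x8 shape):
 * \code
 * auto col = cute::make_layout(cute::make_shape(4, 8));                      // stride (1,4), column-major default
 * int i = col(2, 3);                                                         // 2*1 + 3*4 == 14
 * auto row = cute::make_layout(cute::make_shape(4, 8), cute::GenRowMajor{}); // stride (8,1)
 * int j = row(2, 3);                                                         // 2*8 + 3*1 == 19
 * \endcode
 */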
layouts) +{ + return make_layout(make_shape (layouts.shape()...), + make_stride(layouts.stride()...)); +} + +// +// Convenience tags for common layouts +// + +template +CUTE_HOST_DEVICE constexpr +auto +make_layout(Shape const& shape, GenColMajor) +{ + return make_layout(shape, compact_col_major(shape)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +make_layout(Shape const& shape, GenRowMajor) +{ + return make_layout(shape, compact_row_major(shape)); +} + +// Follow the same ordering induced by the strides, but make the layout compact +template +CUTE_HOST_DEVICE constexpr +auto +make_ordered_layout(Shape const& shape, Order const& order) +{ + static_assert(is_static::value && is_static::value); + return make_layout(shape, compact_order(shape, order)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +make_ordered_layout(Layout const& layout) +{ + return make_ordered_layout(layout.shape(), layout.stride()); +} + +// Make a layout of the same shape that is either ordered or colmajor depending on staticness +template +CUTE_HOST_DEVICE constexpr +auto +make_layout_like(Layout const& layout) +{ + if constexpr (is_static::value && is_static::value) { + return make_ordered_layout(layout.shape(), layout.stride()); + } else { + return make_layout(layout.shape()); + } + + CUTE_GCC_UNREACHABLE; +} + +// Make a layout of the same shape, +// with mode-0 being colmajor then following the mode order in layout +template +CUTE_HOST_DEVICE constexpr +auto +make_fragment_like(Layout const& layout) +{ + auto shape = replace<0>(layout.shape(), size<0>(layout)); + auto order = replace<0>(layout.stride(), Int<0>{}); + if constexpr (is_static::value && is_static::value) { + return make_ordered_layout(shape, order); + } else { + return make_layout(layout.shape()); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +make_identity_layout(Shape const& shape) +{ + return make_layout(shape, make_basis_like(shape)); +} + +// +// Operations to manipulate Layouts like a tuple of pairs +// + +template +CUTE_HOST_DEVICE constexpr +auto +get(Layout const& layout) +{ + // Let the static_asserts in get(shape|stride) catch problems + return make_layout(get(layout.shape()), get(layout.stride())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +take(Layout const& layout) +{ + // Let the static_asserts in take(shape|stride) catch problems + return make_layout(take(layout.shape()), take(layout.stride())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +flatten(Layout const& layout) +{ + return make_layout(flatten(layout.shape()), flatten(layout.stride())); +} + +// +// Utilities +// + +// Return the layout of a mode +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +layout(Layout const& layout) +{ + if constexpr (sizeof...(Is) == 0) { + return layout; + } else { + return get(layout); + } + + CUTE_GCC_UNREACHABLE; +} + +// Return the shape of a mode +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +shape(Layout& layout) +{ + return layout.template shape(); +} + +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +shape(Layout const& layout) +{ + return layout.template shape(); +} + +// Return the stride of a mode +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +stride(Layout& layout) +{ + return layout.template stride(); +} + +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +stride(Layout const& layout) +{ + return layout.template stride(); +} + +// Return the number of elements in a mode +template +CUTE_HOST_DEVICE constexpr +auto +size(Layout const& layout) +{ + return
size(shape(layout)); +} + +// Return the number of modes +template +CUTE_HOST_DEVICE constexpr +auto +rank(Layout const& layout) +{ + return rank(shape(layout)); +} + +// Return the depth of the layout +template +CUTE_HOST_DEVICE constexpr +auto +depth(Layout const& layout) +{ + return depth(shape(layout)); +} + +// Return the codomain size of a mode +// @return M smallest integer such that @a sub_layout(c) < M for all c < size(@a sub_layout) +// where sub_layout = get(layout). +template +CUTE_HOST_DEVICE constexpr +auto +cosize(Layout const& layout) +{ + // Protect against negative strides + auto abs_sub_layout = make_layout(shape(layout), + transform_leaf(stride(layout), abs_fn{})); + return abs_sub_layout(size(abs_sub_layout) - Int<1>{}) + Int<1>{}; +} + +template +using cosize_t = decltype(cosize(std::declval())); + +template +static constexpr int cosize_v = cosize_t::value; + +// Equality +// Return a static or dynamic boolean +template +CUTE_HOST_DEVICE constexpr +auto +operator==(Layout const& layoutA, Layout const& layoutB) +{ + return layoutA.shape() == layoutB.shape() && layoutA.stride() == layoutB.stride(); +} + +// With crd2idx(coord, shape), makes sense to have crd2idx(coord, Layout) as well +template +CUTE_HOST_DEVICE constexpr +auto +crd2idx(Coord const& c, Layout const& layout) +{ + return crd2idx(c, layout.shape(), layout.stride()); +} + +// +// Slice and Dice a layout +// + +template +CUTE_HOST_DEVICE constexpr +auto +slice(Coord const& c, Layout const& layout) +{ + return make_layout(slice(c, layout.shape()), + slice(c, layout.stride())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +slice_and_offset(Coord const& c, Layout const& layout) +{ + return cute::make_tuple(slice(c, layout), crd2idx(c, layout)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +dice(Coord const& c, Layout const& layout) +{ + return make_layout(dice(c, layout.shape()), + dice(c, layout.stride())); +} + +// +// Transform the modes of a layout +// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +transform_layout(Tuple const& t, F&& f, seq) +{ + return make_layout(f(get(t))...); +} + +template +CUTE_HOST_DEVICE constexpr +auto +transform_layout(Tuple0 const& t0, Tuple1 const& t1, F&& f, seq, seq, seq) +{ + return make_layout(f(get(t0),get(t1))..., get(t0)..., get(t1)...); +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +transform_layout(Tuple const& t, F&& f) +{ + return detail::transform_layout(t, f, make_seq{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +transform_layout(Tuple0 const& t0, Tuple1 const& t1, F&& f) +{ + constexpr int R0 = decltype(rank(t0))::value; + constexpr int R1 = decltype(rank(t1))::value; + constexpr int R = (R0 < R1) ? 
R0 : R1; + return detail::transform_layout(t0, t1, f, make_seq{}, make_range{}, make_range{}); +} + +// +// Coalesce and Filter +// + +namespace detail { + +// Look at each element and the front of the stack (in order of priority) +// front(NewLayout) get(Layout) +// s0:d0 _1:d1 => continue +// _1:d0 s1:d1 => replace_front s1:d1 +// s0:s1*d1 s1:d1 => replace_front s0*s1:d1 +// s0:d0 s1:d1 => prepend s1:d1 +// +// @pre OldShape and OldStride are flat +template +CUTE_HOST_DEVICE constexpr +auto +bw_coalesce(OldShape const& old_shape, OldStride const& old_stride, + NewShape const& new_shape, NewStride const& new_stride) +{ + if constexpr (I == -1) { + // Base case, we're done + if constexpr (is_constant<1, NewShape>::value) { + return Layout<_1,_0>{}; + } else { + return Layout{new_shape,new_stride}; + } + } else if constexpr (is_constant<1, decltype(get(old_shape))>::value) { + // shape(layout) == _1, skip it and continue + return bw_coalesce(old_shape, old_stride, new_shape, new_stride); + } else if constexpr (is_constant<1, NewShape>::value) { + // Replace our shape-1 with anything (Can only happen on input new_shape/new_stride) + return bw_coalesce(old_shape, old_stride, get(old_shape), get(old_stride)); + } else if constexpr (is_constant(old_shape) * get(old_stride) == get<0>(new_stride))>::value) { + // Merge modes because the shapes and strides match + return bw_coalesce(old_shape, old_stride, + replace_front(new_shape, get(old_shape) * get<0>(new_shape)), + replace_front(new_stride, get(old_stride))); + } else { + // Can't replace or merge, so prepend a new mode + return bw_coalesce(old_shape, old_stride, + prepend(new_shape, get(old_shape)), + prepend(new_stride, get(old_stride))); + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace detail + +// Combine all the modes that are possible to combine +// Does not respect the profile of the layout, but does preserve total size +template +CUTE_HOST_DEVICE constexpr +auto +coalesce(Layout const& layout) +{ + auto flat_shape = flatten(layout.shape()); + auto flat_stride = flatten(layout.stride()); + + constexpr int R = decltype(rank(flat_shape))::value; + return detail::bw_coalesce(flat_shape, flat_stride, get(flat_shape), get(flat_stride)); +} + +// Apply coalesce at the terminals of trg_profile +template +CUTE_HOST_DEVICE constexpr +auto +coalesce(Layout const& layout, IntTuple const& trg_profile) +{ + if constexpr (is_tuple::value) { + static_assert(tuple_size::value <= Layout::rank); + return transform_layout(layout, trg_profile, [](auto const& l, auto const& t) { return coalesce(l,t); }); + } else { + return coalesce(layout); + } + + CUTE_GCC_UNREACHABLE; +} + +// Replace the modes in layout that have a 0-stride with a 1-size +template +CUTE_HOST_DEVICE constexpr +auto +filter_zeros(Layout const& layout) +{ + return make_layout(filter_zeros(layout.stride(), layout.shape()), layout.stride()); +} + +// Remove all of the 0-strides and 1-sizes +// Return 1-shape if empty +template +CUTE_HOST_DEVICE constexpr +auto +filter(Layout const& layout) +{ + return coalesce(filter_zeros(layout)); +} + +// Apply filter at the terminals of trg_profile +template +CUTE_HOST_DEVICE constexpr +auto +filter(Layout const& layout, IntTuple const& trg_profile) +{ + if constexpr (is_tuple::value) { + static_assert(tuple_size::value <= Layout::rank); + return transform_layout(layout, trg_profile, [](auto const& l, auto const& t) { return filter(l,t); }); + } else { + return filter(layout); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Append, Prepend, Replace +// 
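+// An illustrative sketch (not from the original source): append concatenates
+// x as a new trailing mode, prepend as a new leading mode, and replace
+// substitutes an existing mode. For example:
+//   auto a = make_layout(make_shape(Int<2>{}, Int<3>{}));  // (_2,_3):(_1,_2)
+//   auto b = append(a, make_layout(Int<4>{}, Int<6>{}));   // (_2,_3,_4):(_1,_2,_6)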
+ +template +CUTE_HOST_DEVICE constexpr +auto +append(Layout const& layout, + Layout const& x = {}) +{ + return make_layout(append(layout.shape(), x.shape()), + append(layout.stride(), x.stride())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +prepend(Layout const& layout, + Layout const& x = {}) +{ + return make_layout(prepend(layout.shape(), x.shape()), + prepend(layout.stride(), x.stride())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +replace(Layout const& layout, + Layout const& x) +{ + return make_layout(replace(layout.shape(), x.shape()), + replace(layout.stride(), x.stride())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +group(Layout const& layout) +{ + return make_layout(group(layout.shape()), + group(layout.stride())); +} + +// +// Composition of two layouts: lhs o rhs +// @post compatible(rhs, result) +// @post result(c) = lhs(rhs(c)) +// for all c in the domain of result +// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +composition(Layout const& lhs, + RShape const& rhs_shape, RStride const& rhs_stride) +{ + if constexpr (is_tuple::value) { + // Apply the right-distributivity of Layout composition + return transform_layout(rhs_shape, rhs_stride, [&](auto const& s, auto const& d) { return composition(lhs, s, d); }); + } else + if constexpr (is_scaled_basis::value) { + // Special case for a ScaledBasis stride + return composition(get(lhs), rhs_shape, rhs_stride.value()); + } else + if constexpr (is_integral::value) { + // Integral Rstride (and RShape) + + // NOTE: Should only flatten once for efficiency + auto flat_shape = flatten(lhs.shape()); + auto flat_stride = flatten(lhs.stride()); + [[maybe_unused]] constexpr int R = rank(flat_shape); + + if constexpr (is_constant<0, RStride>::value) { + // Special case shortcut for any static stride-0 + return Layout{rhs_shape, rhs_stride}; + } else + if constexpr (is_integral::value) { + // Special case shortcut for any integral LShape + auto result_stride = rhs_stride * flat_stride; + return Layout{rhs_shape, result_stride}; + } else + if constexpr (is_constant<1, RStride>::value) { + // Special case shortcut for any static stride-1 + auto result_shape_0 = take<0,R-1>(flat_shape); + + // Mod out the rhs_shape from the lhs.shape() + auto const [result_shape_1, rest_shape] = fold(result_shape_0, make_tuple(make_tuple(), rhs_shape), + [] (auto const& init, auto const& si) { + return make_tuple(append(get<0>(init), cute::min(abs(si), get<1>(init))), shape_div(get<1>(init), abs(si))); + }); + + // Jump into coalesce and append (rest_shape, get(lhs.stride()) + return detail::bw_coalesce(result_shape_1, flat_stride, rest_shape, get(flat_stride)); + } else + { + // General case + auto result_shape_0 = take<0,R-1>(flat_shape); + auto result_stride_0 = take<0,R-1>(flat_stride); + + // Divide out the rhs_stride from the lhs.shape() + auto const [result_shape_1, rest_stride] = fold(result_shape_0, make_tuple(make_tuple(), rhs_stride), + [] (auto const& init, auto const& di) { + return make_tuple(append(get<0>(init), shape_div(di, get<1>(init))), shape_div(get<1>(init), di)); + }); + + // Apply any lhs.shape() changes to the stride + auto result_stride_1 = elem_scale(result_stride_0, shape_div(result_shape_0, result_shape_1)); + + // Mod out the rhs_shape from the lhs.shape() + auto const [result_shape_2, rest_shape] = fold(result_shape_1, make_tuple(make_tuple(), rhs_shape), + [] (auto const& init, auto const& si) { + return make_tuple(append(get<0>(init), cute::min(abs(si), get<1>(init))), 
shape_div(get<1>(init), abs(si))); + }); + + // Jump into coalesce and append (rest_shape, rest_stride * get(lhs.stride()) + return detail::bw_coalesce(result_shape_2, result_stride_1, rest_shape, rest_stride * get(flat_stride)); + } + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +composition(Layout const& lhs, + Layout const& rhs) +{ + //return detail::composition(flatten(lhs), rhs.shape(), rhs.stride()); + return detail::composition(lhs, rhs.shape(), rhs.stride()); +} + +template +CUTE_HOST_DEVICE constexpr +auto +composition(Layout const& lhs, + IntTuple const& rhs) +{ + if constexpr (is_tuple::value) { + static_assert(tuple_size::value <= Layout::rank); + // Drop any modes of lhs that aren't hit by rhs + return detail::transform_layout(lhs, rhs, [](auto const& l, auto const& r) { return composition(l,r); }, make_seq::value>{}, seq<>{}, seq<>{}); + } else if constexpr (is_underscore::value) { + return lhs; + } else { + return composition(lhs, make_layout(rhs)); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Complement +// +// Build the complement of a layout. +// @post size(@a result) >= @a cosize_hi / size(filter(@a layout))); +// @post For all i in [1,size(@a result)), +// @a result(i) < @a result(i-1) +// For all j in [0, size(@a layout)), +// @a result(i) != @a layout(j) +// + +template +CUTE_HOST_DEVICE constexpr +auto +complement(Layout const& layout, CoSizeHi const& cosize_hi) +{ + // Remove the stride-0 modes, the size-1 modes, and flatten the layout + auto flat_layout = filter(layout); + + if constexpr (is_constant<0, decltype(flat_layout.stride())>::value) { + // Special case for stride-0 layout + return make_layout(cosize_hi); + } else { + // General case + constexpr int R = decltype(rank(flat_layout))::value; + static_assert(R == 1 || is_static::value, + "Dynamic-stride complement only for rank-1 layouts"); + + // Should just be a sort and a fold... 
+ // Then we could even handle dynamic strides (but they would destroy all static strides) + auto result = fold(make_seq{}, + make_tuple(flat_layout.shape(), + flat_layout.stride(), + make_tuple(), + make_tuple(Int<1>{})), + [](auto const& init, auto i) + { + auto curr_stride = cute::min(get<1>(init)); + auto curr_idx = find(get<1>(init), curr_stride); + auto curr_shape = get(get<0>(init)); + + return make_tuple(remove(get<0>(init)), // Remove the curr shape + remove(get<1>(init)), // Remove the curr stride + append(get<2>(init), curr_stride / get<3,i>(init)), // new shape = curr_stride / last_stride + append(get<3>(init), curr_shape * curr_stride)); // new stride = curr_shape * curr_stride + }); + + // Append the last shape mode + auto result_stride = get<3>(result); + auto result_shape = append(get<2>(result), get<1,0>(result) / back(result_stride)); // new shape = curr_stride / last_stride + + // Compute the rest_stride + auto rest_stride = get<0,0>(result) * get<1,0>(result); + //return make_layout(append(result_shape, ceil_div(cosize_hi, rest_stride)), append(result_stride, rest_stride)); + // Jump into coalesce and append (ceil_div(cosize_hi, rest_stride), rest_stride) + return detail::bw_coalesce(result_shape, result_stride, ceil_div(cosize_hi, rest_stride), rest_stride); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +complement(Layout const& layout) +{ + return complement(layout, cosize(layout)); +} + +// +// Right-Inverse and Left-Inverse +// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +inverse_seq(Shape const& shape, Stride const& stride, seq) +{ + if constexpr (I == decltype(rank(stride))::value) { + return seq{}; + } else { + //auto next_stride = get(shape) * get(stride); + using next_stride = decltype(get(shape) * get(stride)); // NOTE: WAR for g++-7 + + if constexpr (is_static::value) { + auto next_idx = find_if(stride, [](auto a) { return is_constant{}; }); + return inverse_seq(shape, stride, seq{}); + } else { + return seq{}; + } + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace detail + +// +// Build the right-inverse of a layout +// @pre is_static +// @result A layout @a result such that +// @a layout(@a result(i)) == i for all i < size(@a result) +// @result A layout @a result such that +// composition(@a layout, @a result) is identical to make_layout(shape(result)) +// + +template +CUTE_HOST_DEVICE constexpr +auto +right_inverse(Layout const& layout) +{ + auto flat_layout = coalesce(layout); + auto astride = transform_leaf(flat_layout.stride(), abs_fn{}); + + // Find Int<1>{}, the starting idx, and follow the strides to gen inverse_seq + auto next_I = find_if(astride, [](auto a) { return is_constant<1, decltype(a)>{}; }); + [[maybe_unused]] auto iseq = detail::inverse_seq(flat_layout.shape(), astride, seq<>{}); + + if constexpr (tuple_size::value == 0) { + return Layout<_1,_0>{}; // Empty case, nothing found + } else { + // Generate the corresponding new strides and construct + auto rstride = compact_col_major(flat_layout.shape()); + return make_layout(unwrap(transform(iseq, [&](auto i) { return shape(flat_layout); })), + unwrap(transform(iseq, [&](auto i) { return signum(stride(flat_layout)) * get(rstride); }))); + } + + CUTE_GCC_UNREACHABLE; +} + +CUTE_HOST_DEVICE constexpr +auto +right_inverse(Underscore const& _) +{ + return _; +} + +// +// Build the left-inverse of a layout +// @pre is_static +// @pre not has_int0 // @a layout has no 0-strides (is injective) +// @result A layout @a result such that +// 
@a result(@a layout(i)) == i for all i < size(@a layout) +// @result A layout @a result such that +// composition(@a result, @a layout) is identical to make_layout(shape(layout)) +// + +template +CUTE_HOST_DEVICE constexpr +auto +left_inverse(Layout const& layout) +{ + return right_inverse(make_layout(layout, complement(layout))); +} + +CUTE_HOST_DEVICE constexpr +auto +left_inverse(Underscore const& _) +{ + return _; +} + +// +// Max Common Vector +// + +/* Return Int such that N is the maximum number of contiguous elements + * that logically correspond in the layouts of @a a and @a b. That is, + * the number of elements that could reasonably be "vectorized" in the layouts. + * + * @returns Int with N >= 1 + * @post For all 0 <= n < N, a(b[n]) == n (NOTE: Problems with negative strides/coords in this post-condition) + */ +template +CUTE_HOST_DEVICE constexpr +auto +max_common_vector(Layout const& a, Layout const& b) +{ + if constexpr (is_static>::value && + is_static>::value) + { + auto result = coalesce(composition(a, right_inverse(b))); + + if constexpr (is_constant<1, decltype(stride<0>(result))>::value) { + return shape<0>(result); + } else { + return Int<1>{}; + } + } else { + // Dynamic case NOTE: could weaken if we assume dynamic strides are large and multiples of the vector + return Int<1>{}; + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Zip +// + +template +CUTE_HOST_DEVICE constexpr +auto +zip(Layout const& layout) +{ + return make_layout(zip(layout.shape()), + zip(layout.stride())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +zip(Layout const& layoutA, + Layout const& layoutB) +{ + return make_layout(zip(layoutA.shape(), layoutB.shape()), + zip(layoutA.stride(), layoutB.stride())); +} + +// +// Tile unzip +// Logical product and logical divide (on layouts) produce rank-2 results by design. +// Follow the profile of @a tile and zip the rank-2 modes located at the terminals into +// their own mode. +// + +template +CUTE_HOST_DEVICE constexpr +auto +tile_unzip(Layout const& layout, + IntTuple const& tile) +{ + return make_layout(zip2_by(layout.shape(), tile), + zip2_by(layout.stride(), tile)); +} + +// +// Logical divide +// + +template +CUTE_HOST_DEVICE constexpr +auto +logical_divide(Layout const& layout, + Layout const& tile) +{ + //CUTE_STATIC_ASSERT_V(size(layout) % size(tile) == Int<0>{}, + // "Tiling does not evenly divide the block"); + // NOTE: With tiles that have stride-0, this doesn't have to be true + + return composition(layout, make_layout(tile, complement(tile, size(layout)))); +} + +template +CUTE_HOST_DEVICE constexpr +auto +logical_divide(Layout const& layout, + IntTuple const& tile) +{ + if constexpr (is_tuple::value) { + static_assert(tuple_size::value <= Layout::rank, "logical_divide: Too many modes in tile."); + return transform_layout(layout, tile, [](auto const& l, auto const& t) { return logical_divide(l,t); }); + } else if constexpr (is_underscore::value) { + return layout; + } else if constexpr (is_integral::value) { + return logical_divide(layout, make_layout(tile)); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Convenience operator +// that produces layouts like ((BLK_A,BLK_B,...),(a,b,...,x,y)) +// by gathering the tile modes and residuals into a rank-2 result.
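+// An illustrative sketch (not from the original source): dividing an 8x8
+// column-major layout by a 4x2 tile
+//   auto a = make_layout(make_shape(Int<8>{}, Int<8>{}));   // (_8,_8):(_1,_8)
+//   auto t = make_tile(Layout<_4,_1>{}, Layout<_2,_1>{});
+//   auto d = zipped_divide(a, t);  // ((_4,_2),(_2,_4)):((_1,_8),(_4,_16))
+// Mode-0 of the result addresses elements within one tile; mode-1 ranges over
+// the grid of tiles.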
+// + +template +CUTE_HOST_DEVICE constexpr +auto +zipped_divide(Layout const& layout, + Tile const& tile) +{ + return tile_unzip(logical_divide(layout, tile), tile); +} + +// Same as zipped_divide, but unpacks the second mode: ((BLK_A,BLK_B,...),a,b,...,x,y) +template +CUTE_HOST_DEVICE constexpr +auto +tiled_divide(Layout const& layout, + Tile const& tile) +{ + auto div = zipped_divide(layout, tile); + + auto R = rank<1>(div); + return div(_, repeat(_)); +} + +// +// Logical product +// + +template +CUTE_HOST_DEVICE constexpr +auto +logical_product(Layout const& layout, + Layout const& tile) +{ + return make_layout(layout, composition(complement(layout, size(layout)*cosize(tile)), tile)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +logical_product(Layout const& layout, + IntTuple const& tile) +{ + if constexpr (is_tuple::value) { + static_assert(tuple_size::value <= Layout::rank); + return transform_layout(layout, tile, [](auto const& l, auto const& t) { return logical_product(l,t); }); + } else if constexpr (is_underscore::value) { + return layout; + } else if constexpr (is_integral::value) { + return logical_product(layout, make_layout(tile)); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Convenience operator +// that produces layouts like ((BLK_A,BLK_B,...),(a,b,...,x,y)) +// by gathering the block modes and products into a rank-2 result. +// + +template +CUTE_HOST_DEVICE constexpr +auto +zipped_product(Layout const& layout, + Tile const& tile) +{ + return tile_unzip(logical_product(layout, tile), tile); +} + +// Same as zipped_product, but unpacks the second mode: ((BLK_A,BLK_B,...),a,b,...,x,y) +template +CUTE_HOST_DEVICE constexpr +auto +tiled_product(Layout const& layout, + Tile const& tile) +{ + auto div = zipped_product(layout, tile); + + auto R = rank(tile); + return div(_, repeat(_)); +} + +// Attempts to reproduce layout "block" over layout "layout" +// That is, think of every element of "layout" as a "block" +// and return the layout of the resulting structure +template +CUTE_HOST_DEVICE constexpr +auto +blocked_product(Layout const& block, + Layout const& layout) +{ + constexpr int R = cute::max(rank_v, rank_v); + auto padded_block = append(block); + auto padded_layout = append(layout); + + auto result = logical_product(padded_block, padded_layout); + + return coalesce(zip(get<0>(result), get<1>(result)), repeat(Int<1>{})); +} + +template +CUTE_HOST_DEVICE constexpr +auto +raked_product(Layout const& block, + Layout const& layout) +{ + constexpr int R = cute::max(rank_v, rank_v); + auto padded_block = append(block); + auto padded_layout = append(layout); + + auto result = logical_product(padded_block, padded_layout); + + return coalesce(zip(get<1>(result), get<0>(result)), repeat(Int<1>{})); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tile_to_shape(Layout const& layout, + TrgShape const& trg_shape, + ModeOrder const& ord_shape = {}) +{ + CUTE_STATIC_ASSERT_V(rank(layout) <= rank(trg_shape), "Rank of layout must be <= rank of target shape."); + constexpr int R = rank_v; + + auto padded_layout = append(layout); + + auto layout_shape = product_each(padded_layout.shape()); + auto target_shape = product_each(trg_shape); + + // Assert proper division + CUTE_STATIC_ASSERT_V(sum(transform(target_shape, layout_shape, modulus{})) == Int<0>{}, + "Layout shape does not divide the target shape."); + + auto product_shape = shape_div(target_shape, layout_shape); + + return coalesce(blocked_product(padded_layout, make_ordered_layout(product_shape, ord_shape)), product_shape); +} + 
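+// An illustrative sketch (not from the original source): tiling a 2x2
+// column-major block out to a 4x4 target shape
+//   auto blk = make_layout(make_shape(Int<2>{}, Int<2>{}));         // (_2,_2):(_1,_2)
+//   auto big = tile_to_shape(blk, make_shape(Int<4>{}, Int<4>{}));
+//   // big == ((_2,_2),(_2,_2)):((_1,_4),(_2,_8)); each mode pairs the block
+//   // extent with the number of block repetitions along that mode.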
+// +// Upcast +// For stride-1 mode, divide size by N. Divide all other strides by N. +// + +template +CUTE_HOST_DEVICE constexpr +auto +upcast(Shape const& shape, Stride const& stride) +{ + if constexpr (is_tuple::value) { // tuple stride + return transform_layout(shape, stride, [](auto const& s, auto const& d) { return upcast(s,d); }); + } else if constexpr (is_constant<0, Stride>::value) { // static-0 stride + return Layout{shape,stride}; + } else if constexpr (is_static::value) { // static stride + return make_layout(shape_div(shape, shape_div(Int{}, abs(stride))), + shape_div(stride, Int{})); + } else { // dynamic stride + // assume dynamic strides are larger than N and divisible + // assert(stride % N == 0); + return make_layout(shape, safe_div(stride, Int{})); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +upcast(Layout const& layout) +{ + return upcast(layout.shape(), layout.stride()); +} + +// +// Downcast +// For stride-1 mode, multiply size by N. Multiply all other strides by N. +// + +template +CUTE_HOST_DEVICE constexpr +auto +downcast(Shape const& shape, Stride const& stride) +{ + if constexpr (is_tuple::value) { + return transform_layout(shape, stride, [](auto const& s, auto const& d) { return downcast(s,d); }); + } else if constexpr (is_constant<1, Stride>::value || is_constant<-1, Stride>::value) { + return make_layout(shape * Int{}, stride); + } else { + return make_layout(shape, stride * Int{}); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +downcast(Layout const& layout) +{ + CUTE_STATIC_ASSERT(has_int1::value, "Downcast requires adjacent elements"); + return downcast(layout.shape(), layout.stride()); +} + +// +// Recast +// + +template +CUTE_HOST_DEVICE constexpr +auto +recast(Layout const& layout) +{ + if constexpr (sizeof(NewType) == sizeof(OldType)) { + return layout; + } else if constexpr (sizeof(NewType) > sizeof(OldType)) { + static_assert(sizeof(NewType) % sizeof(OldType) == 0, "NewType must be a multiple of OldType"); + return upcast(layout); + } else if constexpr (sizeof(NewType) < sizeof(OldType)) { + static_assert(sizeof(OldType) % sizeof(NewType) == 0, "NewType must be a divisor of OldType"); + return downcast(layout); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Display utilities +// + +template +CUTE_HOST_DEVICE void print(Layout const& layout) +{ + print(layout.shape()); print(":"); print(layout.stride()); +} + +template +CUTE_HOST std::ostream& operator<<(std::ostream& os, Layout const& layout) +{ + return os << shape(layout) << ":" << stride(layout); +} + +// Generic 2D Layout to console table +template +CUTE_HOST_DEVICE +void +print_layout(Layout const& layout) // (m,n) -> idx +{ + CUTE_STATIC_ASSERT_V(rank(layout) == Int<2>{}); + + int idx_width = num_digits(cosize(layout)) + 2; + const char* delim = "+-----------------------"; + + print(layout); print("\n"); + + // Column indices + print(" "); + for (int n = 0; n < size<1>(layout); ++n) { printf(" %*d ", idx_width-2, n); } + printf("\n"); + + // Print out A m-by-n + for (int m = 0; m < size<0>(layout); ++m) { + // Header + print(" "); + for (int n = 0; n < size<1>(layout); ++n) { printf("%.*s", idx_width+1, delim); } + printf("+\n"); + // Values + printf("%2d ", m); // Row indices + for (int n = 0; n < size<1>(layout); ++n) { printf("| %*d ", idx_width-2, int(layout(m,n))); } + printf("|\n"); + } + // Footer + print(" "); + for (int n = 0; n < size<1>(layout); ++n) { printf("%.*s", idx_width+1, delim); } + printf("+\n"); +} + +// 
Generic ThrVal 2D Layout to console table +template +CUTE_HOST_DEVICE +void +print_layout(Layout const& layout, ThrID const& thrid) // (m,n) -> (tid,vid) and tid -> thr_idx +{ + CUTE_STATIC_ASSERT_V(rank(layout) == Int<2>{}); + + print(layout); print("\n"); + print(thrid); print("\n"); + + // Print out m-by-n + for (int m = 0; m < size<0>(layout); ++m) { + // Header + for (int n = 0; n < size<1>(layout); ++n) printf("+------"); + printf("+\n"); + // Values + for (int n = 0; n < size<1>(layout); ++n) printf("|%03d-%02d", int(thrid(layout(m,n) % size(thrid))), int(layout(m,n) / size(thrid))); + printf("|\n"); + } + // Footer + for (int n = 0; n < size<1>(layout); ++n) printf("+------"); + printf("+\n"); +} + +// Generic 2D Layout to Latex printer -- B&W 8-value color coding +template +CUTE_HOST_DEVICE +void +print_latex(Layout const& layout) // (m,n) -> idx +{ + CUTE_STATIC_ASSERT_V(rank(layout) == Int<2>{}); + + char const* latex_header = + "\\documentclass[convert]{standalone}\n" + "\\usepackage{tikz}\n\n" + "\\begin{document}\n" + "\\begin{tikzpicture}[x={(0cm,-1cm)},y={(1cm,0cm)},box/.style={rectangle,draw=black,thick,minimum size=1cm,anchor=center,font=\\Large}]\n\n"; + char const* latex_footer = + "\\end{tikzpicture}\n" + "\\end{document}\n"; + + char const* color_map[8] = {"black!00", + "black!40", + "black!20", + "black!60", + "black!10", + "black!50", + "black!30", + "black!70"}; + + // Header + printf("%% Layout: "); print(layout); printf("\n"); + + printf(latex_header); + + // Layout + for (int i = 0; i < size<0>(layout); ++i) { + for (int j = 0; j < size<1>(layout); ++j) { + int idx = layout(i,j); + + printf("\\node[box,fill=%s] at (%d,%d) {%d};\n", + color_map[idx % 8], + i, j, + idx); + } + } + + // Labels + for (int i = 0, j = -1; i < size<0>(layout); ++i) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, j, i); + } + for (int j = 0, i = -1; j < size<1>(layout); ++j) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, j, j); + } + + // Footer + printf(latex_footer); +} + +// Generic ThrVal 2D Layout to Latex TIKZ -- 8-value color coded by thread +template +CUTE_HOST_DEVICE +void +print_latex(Layout const& layout, ThrID const& thr) // (m,n) -> (tid,vid) and tid -> thr_idx +{ + CUTE_STATIC_ASSERT_V(rank(layout) == Int<2>{}); + + char const* latex_header = + "\\documentclass[convert]{standalone}\n" + "\\usepackage{tikz}\n\n" + "\\begin{document}\n" + "\\begin{tikzpicture}[x={(0cm,-1cm)},y={(1cm,0cm)},box/.style={rectangle,draw=black,thick,minimum size=1cm,anchor=center}]\n\n"; + char const* latex_footer = + "\\end{tikzpicture}\n" + "\\end{document}\n"; + + char const* color_map[8] = {"{rgb,255:red,175;green,175;blue,255}", + "{rgb,255:red,175;green,255;blue,175}", + "{rgb,255:red,255;green,255;blue,175}", + "{rgb,255:red,255;green,175;blue,175}", + "{rgb,255:red,210;green,210;blue,255}", + "{rgb,255:red,210;green,255;blue,210}", + "{rgb,255:red,255;green,255;blue,210}", + "{rgb,255:red,255;green,210;blue,210}"}; + + // Header + printf("%% layout: "); print(layout); printf("\n"); + printf("%% thrid: "); print(thr); printf("\n\n"); + + printf(latex_header); + + // Layout + for (int i = 0; i < size<0>(layout); ++i) { + for (int j = 0; j < size<1>(layout); ++j) { + int thrid = layout(i,j) % size(thr); + int val_idx = layout(i,j) / size(thr); + int thr_idx = thr(thrid); + + printf("\\node[box,fill=%s] at (%d,%d) {\\shortstack{T%d \\\\ V%d}};\n", + color_map[thr_idx % 8], + i, j, + thr_idx, val_idx); + } + } + + // Labels + for (int i = 0, j = -1; i < 
size<0>(layout); ++i) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, j, i); + } + for (int j = 0, i = -1; j < size<1>(layout); ++j) { + printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, j, j); + } + + // Footer + printf(latex_footer); +} + +} // end namespace cute + +// +// Extended Layouts +// + +#include diff --git a/include/cute/numeric/arithmetic_tuple.hpp b/include/cute/numeric/arithmetic_tuple.hpp new file mode 100644 index 0000000000..33471e4f16 --- /dev/null +++ b/include/cute/numeric/arithmetic_tuple.hpp @@ -0,0 +1,388 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include +#include +#include + +namespace cute +{ + +template +struct ArithmeticTuple : tuple +{ + template + CUTE_HOST_DEVICE constexpr + ArithmeticTuple(ArithmeticTuple const& u) + : tuple(static_cast const&>(u)) {} + + template + CUTE_HOST_DEVICE constexpr + ArithmeticTuple(tuple const& u) + : tuple(u) {} + + template + CUTE_HOST_DEVICE constexpr + ArithmeticTuple(U const&... u) + : tuple(u...) {} +}; + +template +struct is_tuple> : true_type {}; + +template +CUTE_HOST_DEVICE constexpr +auto +make_arithmetic_tuple(T const&... t) { + return ArithmeticTuple(t...); +} + +template +CUTE_HOST_DEVICE constexpr +auto +as_arithmetic_tuple(tuple const& t) { + return ArithmeticTuple(t); +} + +// +// Numeric operators +// + +// Addition +template +CUTE_HOST_DEVICE constexpr +auto +operator+(ArithmeticTuple const& t, ArithmeticTuple const& u) { + constexpr int R = cute::max(int(sizeof...(T)), int(sizeof...(U))); + return transform_apply(append(t,Int<0>{}), append(u,Int<0>{}), plus{}, [](auto const&... 
a){ return make_arithmetic_tuple(a...); }); +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator+(ArithmeticTuple const& t, tuple const& u) { + constexpr int R = cute::max(int(sizeof...(T)), int(sizeof...(U))); + return transform_apply(append(t,Int<0>{}), append(u,Int<0>{}), plus{}, [](auto const&... a){ return make_arithmetic_tuple(a...); }); +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator+(tuple const& t, ArithmeticTuple const& u) { + constexpr int R = cute::max(int(sizeof...(T)), int(sizeof...(U))); + return transform_apply(append(t,Int<0>{}), append(u,Int<0>{}), plus{}, [](auto const&... a){ return make_arithmetic_tuple(a...); }); +} + +// +// Special cases +// + +template +CUTE_HOST_DEVICE constexpr +auto +operator+(constant, ArithmeticTuple const& u) { + return u; +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator+(ArithmeticTuple const& t, constant) { + return t; +} + +// +// ArithmeticTupleIterator +// + +template +struct ArithmeticTupleIterator +{ + ArithTuple coord_; + + CUTE_HOST_DEVICE constexpr + ArithmeticTupleIterator() : coord_() {} + CUTE_HOST_DEVICE constexpr + ArithmeticTupleIterator(ArithTuple const& coord) : coord_(coord) {} + + CUTE_HOST_DEVICE constexpr + ArithTuple const& operator*() const { return coord_; } + + template + CUTE_HOST_DEVICE constexpr + auto operator+(Coord const& c) const { + return ArithmeticTupleIterator(coord_ + c); + } + + template + CUTE_HOST_DEVICE constexpr + auto operator[](Coord const& c) const { return *(*this + c); } +}; + +template +CUTE_HOST_DEVICE void print(ArithmeticTupleIterator const& iter) { + printf("ArithTuple"); print(iter.coord_); +} + +// +// ArithmeticTuple "basis" elements +// + +// Abstract value: +// A ScaledBasis is a (at least) rank-N0 ArithmeticTuple: +// (_0,_0,...,T,_0,...) 
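+// For example (an illustrative sketch, not from the original source; ~ denotes
+// the abstract tuple value):
+//   E<0>{}           ~ (_1)     -- unit basis element of mode 0
+//   E<1>{}           ~ (_0,_1)  -- unit basis element of mode 1
+//   Int<3>{}*E<1>{}  ~ (_0,_3)  -- scaling multiplies the held value
+//   E<0>{} + E<1>{}  ~ (_1,_1)  -- addition promotes to an ArithmeticTuple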
+ +template +struct ScaledBasis : private tuple +{ + CUTE_HOST_DEVICE constexpr + ScaledBasis(T const& t = {}) : tuple(t) {} + + CUTE_HOST_DEVICE constexpr + decltype(auto) value() { return get<0>(static_cast &>(*this)); } + CUTE_HOST_DEVICE constexpr + decltype(auto) value() const { return get<0>(static_cast const&>(*this)); } + + CUTE_HOST_DEVICE static constexpr + auto mode() { return Int{}; } +}; + +template +struct is_scaled_basis : false_type {}; +template +struct is_scaled_basis> : true_type {}; + +template +struct is_integral> : true_type {}; + +template +CUTE_HOST_DEVICE constexpr auto +basis_value(T const& e) { + return e; +} + +template +CUTE_HOST_DEVICE constexpr auto +basis_value(ScaledBasis const& e) { + return basis_value(e.value()); +} + +namespace detail { + +template +struct Basis; + +template <> +struct Basis<> { + using type = Int<1>; +}; + +template +struct Basis { + using type = ScaledBasis::type, N>; +}; + +} // end namespace detail + +template +using E = typename detail::Basis::type; + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +as_arithmetic_tuple(T const& t, seq, seq) { + return make_arithmetic_tuple((void(I),Int<0>{})..., t, (void(J),Int<0>{})...); +} + +template +CUTE_HOST_DEVICE constexpr +auto +as_arithmetic_tuple(ArithmeticTuple const& t, seq, seq) { + return make_arithmetic_tuple(get(t)..., (void(J),Int<0>{})...); +} + +} // end namespace detail + +// Turn a ScaledBasis into a rank-M ArithmeticTuple +// with N prefix 0s: (_0,_0,...N...,_0,T,_0,...,_0,_0) +template +CUTE_HOST_DEVICE constexpr +auto +as_arithmetic_tuple(ScaledBasis const& t) { + static_assert(M > N, "Mismatched ranks"); + return detail::as_arithmetic_tuple(t.value(), make_seq{}, make_seq{}); +} + +// Turn an ArithmeticTuple into a rank-M ArithmeticTuple +// with postfix 0s: (t0,t1,t2,...,_0,...,_0,_0) +template +CUTE_HOST_DEVICE constexpr +auto +as_arithmetic_tuple(ArithmeticTuple const& t) { + static_assert(M >= sizeof...(T), "Mismatched ranks"); + return detail::as_arithmetic_tuple(t, make_seq{}, make_seq{}); +} + +// Return...
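+// (Illustrative note, not from the original source: make_basis_like maps a
+// shape to a congruent tuple of basis elements, e.g.
+// make_basis_like(make_shape(Int<2>{}, Int<2>{})) ~ (E<0>{}, E<1>{}); these
+// become the strides of make_identity_layout.)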
+template +CUTE_HOST_DEVICE constexpr +auto +make_basis_like(Shape const& shape) +{ + if constexpr (is_integral::value) { + return Int<1>{}; + } else { + // Generate bases for each rank of shape + return transform(tuple_seq{}, [&](auto I) { + // Generate bases for each rank of shape_i and add an i on front + constexpr int i = decltype(I)::value; // NOTE: nvcc workaround + return transform_leaf(make_basis_like(get(shape)), [&](auto e) { return ScaledBasis{}; }); + }); + } + + CUTE_GCC_UNREACHABLE; +} + +// Equality +template +CUTE_HOST_DEVICE constexpr +auto +operator==(ScaledBasis, Int) { + return false_type{}; +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator==(Int, ScaledBasis) { + return false_type{}; +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator==(ScaledBasis const& t, ScaledBasis const& u) { + return bool_constant{} && t.value() == u.value(); +} + +// Multiplication +template ::value)> +CUTE_HOST_DEVICE constexpr +auto +operator*(A const& a, ScaledBasis const& e) { + return ScaledBasis{a*e.value()}; +} + +template ::value)> +CUTE_HOST_DEVICE constexpr +auto +operator*(ScaledBasis const& e, B const& b) { + return ScaledBasis{e.value()*b}; +} + +// Addition +template +CUTE_HOST_DEVICE constexpr +auto +operator+(ScaledBasis const& t, ArithmeticTuple const& u) { + constexpr int R = cute::max(N+1, int(sizeof...(U))); + return as_arithmetic_tuple(t) + as_arithmetic_tuple(u); +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator+(ArithmeticTuple const& t, ScaledBasis const& u) { + constexpr int R = cute::max(int(sizeof...(T)), M+1); + return as_arithmetic_tuple(t) + as_arithmetic_tuple(u); +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator+(ScaledBasis const& t, ScaledBasis const& u) { + constexpr int R = cute::max(N+1,M+1); + return as_arithmetic_tuple(t) + as_arithmetic_tuple(u); +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator+(constant, ScaledBasis const& u) { + return u; +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator+(ScaledBasis const& t, constant) { + return t; +} + +// +// Display utilities +// + +template +CUTE_HOST_DEVICE void print(ScaledBasis const& e) { + printf("%d:", N); print(e.value()); +} + +template +CUTE_HOST std::ostream& operator<<(std::ostream& os, ScaledBasis const& e) { + return os << N << ":" << e.value(); +} + +} // end namespace cute + + +namespace std +{ + +template +struct tuple_size> + : std::integral_constant +{}; + +template +struct tuple_element> + : std::tuple_element> +{}; + +} // end namespace std diff --git a/include/cute/numeric/bfloat.hpp b/include/cute/numeric/bfloat.hpp new file mode 100644 index 0000000000..94f64ab572 --- /dev/null +++ b/include/cute/numeric/bfloat.hpp @@ -0,0 +1,51 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include + +namespace cute { + +using cutlass::bfloat16_t; + +// +// Display utilities +// + +CUTE_HOST std::ostream& operator<<(std::ostream& os, bfloat16_t const& v) +{ + return os << float(v); +} + +} // end namespace cute diff --git a/include/cute/numeric/complex.hpp b/include/cute/numeric/complex.hpp new file mode 100644 index 0000000000..3790ebd3b1 --- /dev/null +++ b/include/cute/numeric/complex.hpp @@ -0,0 +1,163 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include + +//#if defined(__CUDA_ARCH__) +//# include +//#else +//# include +//#endif + +// With CUDA 11.4, builds show spurious "-Wconversion" warnings +// on line 656 of thrust/detail/type_traits.h. +// These pragmas suppress the warnings. +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#include +#pragma GCC diagnostic pop + +#include + +namespace cute +{ + +//#if defined(__CUDA_ARCH__) +//template +//using complex = cuda::std::complex; +//#else +//template +//using complex = std::complex; +//#endif + +//template +//using complex = thrust::complex; + +using thrust::complex; + +template +CUTE_HOST_DEVICE +T real(complex const& z) { + return z.real(); +} + +template +CUTE_HOST_DEVICE +T imag(complex const& z) { + return z.imag(); +} + +template +CUTE_HOST_DEVICE +complex conj(complex const& z) { + return complex(real(z), -imag(z)); +} + +// cute::conj forwards scalars +template +CUTE_HOST_DEVICE +T conj(T z) { + return z; +} + +//CUTE_HOST_DEVICE constexpr +//float conj(float z) { return z; } +//CUTE_HOST_DEVICE constexpr +//double conj(double z) { return z; } + +/// Fused multiply-add for complex numbers +template +CUTE_HOST_DEVICE constexpr +void +fma(complex & d, + complex const& a, + complex const& b, + complex const& c) +{ + d.real(c.real() + a.real() * b.real()); + d.imag(c.imag() + a.real() * b.imag()); + d.real(d.real() - a.imag() * b.imag()); + d.imag(d.imag() + a.imag() * b.real()); +} + +/// Fused multiply-add for triplets +template +CUTE_HOST_DEVICE constexpr +void +fma(complex const& a, + complex const& b, + complex & c) +{ + return fma(c, a, b, c); +} + +/// Used to determine the real-valued underlying type of a numeric type T +template +struct RealType { + using Type = T; +}; + +/// Partial specialization for complex-valued type +template +struct RealType> { + using Type = T; +}; + +////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct is_complex { + static bool const value = false; +}; + +template +struct is_complex> { + static bool const value = true; +}; + +////////////////////////////////////////////////////////////////////////////////////////////////// +// Display utilities + +template +CUTE_HOST std::ostream& operator<<(std::ostream& os, complex const& z) +{ + T _r = z.real(); + T _i = z.imag(); + + if (bool(_i)) { + return os << _r << "+i" << _i; + } else { + return os << _r; + } +} + +} // end namespace cute diff --git a/include/cute/numeric/float8.hpp b/include/cute/numeric/float8.hpp new file mode 100644 index 0000000000..3fa471db34 --- /dev/null +++ b/include/cute/numeric/float8.hpp @@ -0,0 +1,43 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include + +namespace cute { + +using cutlass::float_e4m3_t; +using cutlass::float_e5m2_t; + +} // end namespace cute diff --git a/include/cute/numeric/half.hpp b/include/cute/numeric/half.hpp new file mode 100644 index 0000000000..704ba28d55 --- /dev/null +++ b/include/cute/numeric/half.hpp @@ -0,0 +1,41 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include +#include +#include + +namespace cute { + +using cutlass::half_t; + +} // end namespace cute diff --git a/include/cute/numeric/int.hpp b/include/cute/numeric/int.hpp new file mode 100644 index 0000000000..a08297f209 --- /dev/null +++ b/include/cute/numeric/int.hpp @@ -0,0 +1,129 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#if defined(__CUDACC_RTC__) +#include +#else +#include +#endif + +#include +#include + +namespace cute +{ + +// +// Signed integers +// + +using int8_t = std::int8_t; +using int16_t = std::int16_t; +using int32_t = std::int32_t; +using int64_t = std::int64_t; + +template struct int_bit; +template <> struct int_bit< 2> { using type = cute::int2b_t; }; +template <> struct int_bit< 4> { using type = cute::int4b_t; }; +template <> struct int_bit< 8> { using type = int8_t; }; +template <> struct int_bit< 16> { using type = int16_t; }; +template <> struct int_bit< 32> { using type = int32_t; }; +template <> struct int_bit< 64> { using type = int64_t; }; + +template +using int_bit_t = typename int_bit::type; + +template +using int_byte = int_bit<8*N>; + +template +using int_byte_t = typename int_byte::type; + +// +// Unsigned integers +// + +using uint8_t = std::uint8_t; +using uint16_t = std::uint16_t; +using uint32_t = std::uint32_t; +using uint64_t = std::uint64_t; + +template struct uint_bit; +template <> struct uint_bit< 1> { using type = cute::uint1b_t; }; +template <> struct uint_bit< 2> { using type = cute::uint2b_t; }; +template <> struct uint_bit< 4> { using type = cute::uint4b_t; }; +template <> struct uint_bit< 8> { using type = uint8_t; }; +template <> struct uint_bit< 16> { using type = uint16_t; }; +template <> struct uint_bit< 32> { using type = uint32_t; }; +template <> struct uint_bit< 64> { using type = uint64_t; }; +template <> struct uint_bit<128> { using type = cute::uint128_t; }; + +template +using uint_bit_t = typename uint_bit::type; + +template +using uint_byte = uint_bit<8*N>; + +template +using uint_byte_t = typename uint_byte::type; + +// +// sizeof_bytes +// + +template +struct sizeof_bytes { + static constexpr std::size_t value = sizeof(T); +}; +template +static constexpr int sizeof_bytes_v = sizeof_bytes::value; + +// +// sizeof_bits +// + +template +struct sizeof_bits { + static constexpr std::size_t value = sizeof(T) * 8; +}; +template <> +struct sizeof_bits { + static constexpr std::size_t value = 1; +}; +template +struct sizeof_bits> { + static constexpr std::size_t value = Bits; +}; +template +static constexpr int sizeof_bits_v = sizeof_bits::value; + +} // namespace cute diff --git a/include/cute/numeric/integer_sequence.hpp b/include/cute/numeric/integer_sequence.hpp new file mode 100644 index 0000000000..73a83f76a9 --- /dev/null +++ b/include/cute/numeric/integer_sequence.hpp @@ -0,0 +1,139 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include // std::integer_sequence + +#include + +namespace cute +{ + +using std::integer_sequence; +using std::make_integer_sequence; + +namespace detail { + +template +struct make_integer_range_impl; + +template +struct make_integer_range_impl, Begin> { + using type = integer_sequence; +}; + +} // end namespace detail + +template +using make_integer_range = typename detail::make_integer_range_impl< + T, + make_integer_sequence 0) ? (End-Begin) : 0>, + Begin>::type; + +// +// Common aliases +// + +// int_sequence + +template +using int_sequence = integer_sequence; + +template +using make_int_sequence = make_integer_sequence; + +template +using make_int_range = make_integer_range; + +// index_sequence + +template +using index_sequence = integer_sequence; + +template +using make_index_sequence = make_integer_sequence; + +template +using make_index_range = make_integer_range; + +// +// Shortcuts +// + +template +using seq = int_sequence; + +template +using make_seq = make_int_sequence; + +template +using make_range = make_int_range; + +template +using tuple_seq = make_seq>::value>; + +} // end namespace cute + + +// +// Specialize tuple-related functionality for cute::integer_sequence +// + +#include +#include + +namespace cute +{ + +template +CUTE_HOST_DEVICE constexpr +std::tuple_element_t> +get(integer_sequence) { + static_assert(I < sizeof...(Ints), "Index out of range"); + return {}; +} + +} // end namespace cute + +namespace std +{ + +template +struct tuple_size> + : std::integral_constant +{}; + +template +struct tuple_element> + : std::tuple_element...>> +{}; + +} // end namespace std diff --git a/include/cute/numeric/integer_subbyte.hpp b/include/cute/numeric/integer_subbyte.hpp new file mode 100644 index 0000000000..3d24a95293 --- /dev/null +++ b/include/cute/numeric/integer_subbyte.hpp @@ -0,0 +1,233 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#if defined(__CUDACC_RTC__) +#include +#else +#include +#endif + +#include +#include + +namespace cute { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct integer_subbyte +{ + /// Storage type + using Storage = uint8_t; + + /// Number of bits + static_assert(Bits <= 8*sizeof(Storage), "Require a subbyte of bits in integer_subbyte"); + + /// External type + using xint_t = typename std::conditional::type; + + /// Bitmask for truncation from larger integers + static constexpr Storage bits_mask_ = Storage((1 << Bits) - 1); + /// Bitmask for the sign bit + static constexpr Storage sign_mask_ = Storage((Signed ? 1 : 0) << (Bits - 1)); + + // + // Data members + // + + Storage storage; + + // + // Methods + // + + /// No operation + CUTE_HOST_DEVICE constexpr + integer_subbyte() {} + + /// Conversion from integer type + CUTE_HOST_DEVICE constexpr + integer_subbyte(int value) // NOTE: Sign extension? 
+ : storage(reinterpret_cast(value) & bits_mask_) {} + + CUTE_HOST_DEVICE constexpr + integer_subbyte(unsigned value) + : storage(reinterpret_cast(value) & bits_mask_) {} + + /// Convert to int or unsigned + CUTE_HOST_DEVICE constexpr + operator xint_t() const { + if (sign_mask_ & storage) { // Sign extend + return xint_t(storage) | ~xint_t(bits_mask_); + } else { + return xint_t(storage); + } + } + + /// Equality + CUTE_HOST_DEVICE constexpr + bool operator==(integer_subbyte const& rhs) const { + return storage == rhs.storage; + } + + /// Inequality + CUTE_HOST_DEVICE constexpr + bool operator!=(integer_subbyte const& rhs) const { + return storage != rhs.storage; + } + + /// Less than or equal + CUTE_HOST_DEVICE constexpr + bool operator<=(integer_subbyte const& rhs) const { + if (sign_mask_ & storage) { + return !(rhs.storage < storage); + } else { + return storage < rhs.storage; + } + } + + /// Less than + CUTE_HOST_DEVICE constexpr + bool operator<(integer_subbyte const& rhs) const { + if (sign_mask_ & storage) { + return !(rhs.storage <= storage); + } else { + return storage < rhs.storage; + } + } + + /// Greater than or equal + CUTE_HOST_DEVICE constexpr + bool operator>=(integer_subbyte const& rhs) const { + return !(*this < rhs); + } + + /// Greater than + CUTE_HOST_DEVICE constexpr + bool operator>(integer_subbyte const& rhs) const { + return !(*this <= rhs); + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// 1-bit unsigned integer type +using uint1b_t = integer_subbyte<1, false>; + +/// 2-bit integer type +using int2b_t = integer_subbyte<2, true>; + +/// 2-bit unsigned integer type +using uint2b_t = integer_subbyte<2, false>; + +/// 4-bit integer type +using int4b_t = integer_subbyte<4, true>; + +/// 4-bit unsigned integer type +using uint4b_t = integer_subbyte<4, false>; + +/// 1-bit binary type +using bin1_t = bool; + +} // namespace cute + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#if !defined(__CUDACC_RTC__) + +#include + +namespace std { + +template <> +struct numeric_limits { + CUTE_HOST_DEVICE static constexpr + cute::uint1b_t const lowest() noexcept { return 0; } + CUTE_HOST_DEVICE static constexpr + cute::uint1b_t const min() noexcept { return 0; } + CUTE_HOST_DEVICE static constexpr + cute::uint1b_t const max() noexcept { return 1; } + static constexpr bool is_integer = true; + static constexpr bool is_signed = false; +}; + +template <> +struct numeric_limits { + CUTE_HOST_DEVICE static constexpr + cute::int2b_t lowest() noexcept { return -2; } + CUTE_HOST_DEVICE static constexpr + cute::int2b_t min() noexcept { return -2; } + CUTE_HOST_DEVICE static constexpr + cute::int2b_t max() noexcept { return 1; } + static constexpr bool is_integer = true; + static constexpr bool is_signed = true; +}; + +template <> +struct numeric_limits { + CUTE_HOST_DEVICE static constexpr + cute::uint2b_t const lowest() noexcept { return 0; } + CUTE_HOST_DEVICE static constexpr + cute::uint2b_t const min() noexcept { return 0; } + CUTE_HOST_DEVICE static constexpr + cute::uint2b_t const max() noexcept { return 3; } + static constexpr bool is_integer = true; + static constexpr bool is_signed = false; +}; + +template <> +struct numeric_limits { + CUTE_HOST_DEVICE static constexpr + cute::int4b_t lowest() noexcept { return -8; } + CUTE_HOST_DEVICE static constexpr + cute::int4b_t min() noexcept { return -8; } + CUTE_HOST_DEVICE static constexpr + cute::int4b_t max() 
noexcept { return 7; } + static constexpr bool is_integer = true; + static constexpr bool is_signed = true; +}; + +template <> +struct numeric_limits { + CUTE_HOST_DEVICE static constexpr + cute::uint4b_t const lowest() noexcept { return 0; } + CUTE_HOST_DEVICE static constexpr + cute::uint4b_t const min() noexcept { return 0; } + CUTE_HOST_DEVICE static constexpr + cute::uint4b_t const max() noexcept { return 15; } + static constexpr bool is_integer = true; + static constexpr bool is_signed = false; +}; + +} // namespace std + +#endif diff --git a/include/cute/numeric/integral_constant.hpp b/include/cute/numeric/integral_constant.hpp new file mode 100644 index 0000000000..106763df58 --- /dev/null +++ b/include/cute/numeric/integral_constant.hpp @@ -0,0 +1,414 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include + +namespace cute +{ + +template +struct constant : std::integral_constant { + static constexpr T value = v; + using value_type = T; + using type = constant; + CUTE_HOST_DEVICE constexpr operator value_type() const noexcept { return value; } + CUTE_HOST_DEVICE constexpr value_type operator()() const noexcept { return value; } +}; + +template +using integral_constant = constant; + +template +using bool_constant = constant; + +using true_type = bool_constant; +using false_type = bool_constant; + +// +// Traits +// + +// Use std::is_integral to match built-in integral types (int, int64_t, unsigned, etc) +// Use cute::is_integral to match both built-in integral types AND constant + +template +struct is_integral : bool_constant::value> {}; +template +struct is_integral> : true_type {}; + +// is_static detects if an (abstract) value is defined completely by it's type (no members) + +template +struct is_static : bool_constant::value> {}; + +// is_constant detects if a type is a constant and if v is equal to a value + +template +struct is_constant : false_type {}; +template +struct is_constant > : bool_constant {}; +template +struct is_constant const > : bool_constant {}; +template +struct is_constant const&> : bool_constant {}; +template +struct is_constant &> : bool_constant {}; +template +struct is_constant &&> : bool_constant {}; + +// +// Specializations +// + +template +using Int = constant; + +using _m32 = Int<-32>; +using _m24 = Int<-24>; +using _m16 = Int<-16>; +using _m12 = Int<-12>; +using _m10 = Int<-10>; +using _m9 = Int<-9>; +using _m8 = Int<-8>; +using _m7 = Int<-7>; +using _m6 = Int<-6>; +using _m5 = Int<-5>; +using _m4 = Int<-4>; +using _m3 = Int<-3>; +using _m2 = Int<-2>; +using _m1 = Int<-1>; +using _0 = Int<0>; +using _1 = Int<1>; +using _2 = Int<2>; +using _3 = Int<3>; +using _4 = Int<4>; +using _5 = Int<5>; +using _6 = Int<6>; +using _7 = Int<7>; +using _8 = Int<8>; +using _9 = Int<9>; +using _10 = Int<10>; +using _12 = Int<12>; +using _16 = Int<16>; +using _24 = Int<24>; +using _32 = Int<32>; +using _64 = Int<64>; +using _96 = Int<96>; +using _128 = Int<128>; +using _192 = Int<192>; +using _256 = Int<256>; +using _512 = Int<512>; +using _1024 = Int<1024>; +using _2048 = Int<2048>; +using _4096 = Int<4096>; +using _8192 = Int<8192>; + +/***************/ +/** Operators **/ +/***************/ + +#define CUTE_LEFT_UNARY_OP(OP) \ + template \ + CUTE_HOST_DEVICE constexpr \ + constant \ + operator OP (constant) { \ + return {}; \ + } +#define CUTE_RIGHT_UNARY_OP(OP) \ + template \ + CUTE_HOST_DEVICE constexpr \ + constant \ + operator OP (constant) { \ + return {}; \ + } + +#define CUTE_BINARY_OP(OP) \ + template \ + CUTE_HOST_DEVICE constexpr \ + constant \ + operator OP (constant, constant) { \ + return {}; \ + } + +CUTE_LEFT_UNARY_OP(+); +CUTE_LEFT_UNARY_OP(-); +CUTE_LEFT_UNARY_OP(~); +CUTE_LEFT_UNARY_OP(!); +CUTE_LEFT_UNARY_OP(*); + +CUTE_BINARY_OP( +); +CUTE_BINARY_OP( -); +CUTE_BINARY_OP( *); +CUTE_BINARY_OP( /); +CUTE_BINARY_OP( %); +CUTE_BINARY_OP( &); +CUTE_BINARY_OP( |); +CUTE_BINARY_OP( ^); +CUTE_BINARY_OP(<<); +CUTE_BINARY_OP(>>); + +CUTE_BINARY_OP(&&); +CUTE_BINARY_OP(||); + +CUTE_BINARY_OP(==); +CUTE_BINARY_OP(!=); +CUTE_BINARY_OP( >); +CUTE_BINARY_OP( <); +CUTE_BINARY_OP(>=); +CUTE_BINARY_OP(<=); + +#undef CUTE_BINARY_OP +#undef CUTE_LEFT_UNARY_OP +#undef CUTE_RIGHT_UNARY_OP + +// +// Mixed static-dynamic special 
cases +// + +template ::value)> +CUTE_HOST_DEVICE constexpr +constant +operator*(constant, U) { + return {}; +} + +template ::value)> +CUTE_HOST_DEVICE constexpr +constant +operator*(U, constant) { + return {}; +} + +template ::value)> +CUTE_HOST_DEVICE constexpr +constant +operator/(constant, U) { + return {}; +} + +template ::value)> +CUTE_HOST_DEVICE constexpr +constant +operator%(U, constant) { + return {}; +} + +template ::value)> +CUTE_HOST_DEVICE constexpr +constant +operator%(U, constant) { + return {}; +} + +template ::value)> +CUTE_HOST_DEVICE constexpr +constant +operator%(constant, U) { + return {}; +} + +template ::value)> +CUTE_HOST_DEVICE constexpr +constant +operator&(constant, U) { + return {}; +} + +template ::value)> +CUTE_HOST_DEVICE constexpr +constant +operator&(U, constant) { + return {}; +} + +template ::value && !bool(t))> +CUTE_HOST_DEVICE constexpr +constant +operator&&(constant, U) { + return {}; +} + +template ::value && !bool(t))> +CUTE_HOST_DEVICE constexpr +constant +operator&&(U, constant) { + return {}; +} + +template ::value && bool(t))> +CUTE_HOST_DEVICE constexpr +constant +operator||(constant, U) { + return {}; +} + +template ::value && bool(t))> +CUTE_HOST_DEVICE constexpr +constant +operator||(U, constant) { + return {}; +} + +// +// Named functions from math.hpp +// + +#define CUTE_NAMED_UNARY_FN(OP) \ + template \ + CUTE_HOST_DEVICE constexpr \ + constant \ + OP (constant) { \ + return {}; \ + } + +#define CUTE_NAMED_BINARY_FN(OP) \ + template \ + CUTE_HOST_DEVICE constexpr \ + constant \ + OP (constant, constant) { \ + return {}; \ + } \ + \ + template ::value)> \ + CUTE_HOST_DEVICE constexpr \ + auto \ + OP (constant, U u) { \ + return OP(t,u); \ + } \ + \ + template ::value)> \ + CUTE_HOST_DEVICE constexpr \ + auto \ + OP (T t, constant) { \ + return OP(t,u); \ + } + +CUTE_NAMED_UNARY_FN(abs); +CUTE_NAMED_UNARY_FN(signum); +CUTE_NAMED_UNARY_FN(has_single_bit); + +CUTE_NAMED_BINARY_FN(max); +CUTE_NAMED_BINARY_FN(min); +CUTE_NAMED_BINARY_FN(shiftl); +CUTE_NAMED_BINARY_FN(shiftr); +CUTE_NAMED_BINARY_FN(gcd); +CUTE_NAMED_BINARY_FN(lcm); + +#undef CUTE_NAMED_UNARY_FN +#undef CUTE_NAMED_BINARY_FN + +// +// Other functions +// + +template +CUTE_HOST_DEVICE constexpr +constant +safe_div(constant, constant) { + static_assert(t % u == 0, "Static safe_div requires t % u == 0"); + return {}; +} + +template ::value)> +CUTE_HOST_DEVICE constexpr +auto +safe_div(constant, U u) { + return t / u; +} + +template ::value)> +CUTE_HOST_DEVICE constexpr +auto +safe_div(T t, constant) { + return t / u; +} + +// cute::true_type prefers standard conversion to std::true_type +// over user-defined conversion to bool +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +conditional_return(std::true_type, TrueType&& t, FalseType&&) { + return static_cast(t); +} + +// cute::false_type prefers standard conversion to std::false_type +// over user-defined conversion to bool +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +conditional_return(std::false_type, TrueType&&, FalseType&& f) { + return static_cast(f); +} + +// TrueType and FalseType must have a common type +template +CUTE_HOST_DEVICE constexpr +auto +conditional_return(bool b, TrueType const& t, FalseType const& f) { + return b ? 
t : f; +} + +// +// Display utilities +// + +template +CUTE_HOST_DEVICE void print(integral_constant const&) { + printf("_%d", N); +} + +template +CUTE_HOST std::ostream& operator<<(std::ostream& os, integral_constant const&) { + return os << "_" << N; +} + +} // end namespace cute diff --git a/include/cute/numeric/math.hpp b/include/cute/numeric/math.hpp new file mode 100644 index 0000000000..03e8379977 --- /dev/null +++ b/include/cute/numeric/math.hpp @@ -0,0 +1,319 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#if defined(__CUDACC_RTC__) +#include +#else +#include +#endif + +#include + +namespace cute +{ + +// +// Common Operations +// + +template ::value && + std::is_arithmetic::value)> +CUTE_HOST_DEVICE constexpr +auto +max(T const& t, U const& u) { + return t < u ? u : t; +} + +template ::value && + std::is_arithmetic::value)> +CUTE_HOST_DEVICE constexpr +auto +min(T const& t, U const& u) { + return t < u ? t : u; +} + +template ::value)> +CUTE_HOST_DEVICE constexpr +auto +abs(T const& t) { + if constexpr (std::is_signed::value) { + return t < T(0) ? 
-t : t; + } else { + return t; + } + + CUTE_GCC_UNREACHABLE; +} + +// +// C++17 operations +// + +// Greatest common divisor of two integers +template ::value && + std::is_integral::value)> +CUTE_HOST_DEVICE constexpr +auto +gcd(T t, U u) { + while (true) { + if (t == 0) { return u; } + u %= t; + if (u == 0) { return t; } + t %= u; + } +} + +// Least common multiple of two integers +template ::value && + std::is_integral::value)> +CUTE_HOST_DEVICE constexpr +auto +lcm(T const& t, U const& u) { + return (t / gcd(t,u)) * u; +} + +// +// C++20 operations +// + +// Checks if a number is an integral power of two +template +CUTE_HOST_DEVICE constexpr +bool +has_single_bit(T x) { + return x != 0 && (x & (x - 1)) == 0; +} + +// Smallest number of bits needed to represent the given value +// bit_width( 0b0000 ) = 0 +// bit_width( 0b0001 ) = 1 +// bit_width( 0b0010 ) = 2 +// bit_width( 0b0011 ) = 2 +// bit_width( 0b0100 ) = 3 +// bit_width( 0b0101 ) = 3 +// bit_width( 0b0110 ) = 3 +// bit_width( 0b0111 ) = 3 +template +CUTE_HOST_DEVICE constexpr +T +bit_width(T x) { + static_assert(std::is_unsigned::value, "Only to be used for unsigned types."); + constexpr int N = (std::numeric_limits::digits == 64 ? 6 : + (std::numeric_limits::digits == 32 ? 5 : + (std::numeric_limits::digits == 16 ? 4 : + (std::numeric_limits::digits == 8 ? 3 : (assert(false),0))))); + T r = 0; + for (int i = N - 1; i >= 0; --i) { + T shift = (x > ((T(1) << (T(1) << i))-1)) << i; + x >>= shift; + r |= shift; + } + return r + (x != 0); +} + +// Smallest integral power of two not less than the given value +// bit_ceil( 0b00000000 ) = 0b00000001 +// bit_ceil( 0b00000001 ) = 0b00000001 +// bit_ceil( 0b00000010 ) = 0b00000010 +// bit_ceil( 0b00000011 ) = 0b00000100 +// bit_ceil( 0b00000100 ) = 0b00000100 +// bit_ceil( 0b00000101 ) = 0b00001000 +// bit_ceil( 0b00000110 ) = 0b00001000 +// bit_ceil( 0b00000111 ) = 0b00001000 +// bit_ceil( 0b00001000 ) = 0b00001000 +// bit_ceil( 0b00001001 ) = 0b00010000 +template +CUTE_HOST_DEVICE constexpr +T +bit_ceil(T x) { + return x == 0 ? T(1) : (T(1) << bit_width(x - 1)); +} + +// Largest integral power of two not greater than the given value +// bit_floor( 0b00000000 ) = 0b00000000 +// bit_floor( 0b00000001 ) = 0b00000001 +// bit_floor( 0b00000010 ) = 0b00000010 +// bit_floor( 0b00000011 ) = 0b00000010 +// bit_floor( 0b00000100 ) = 0b00000100 +// bit_floor( 0b00000101 ) = 0b00000100 +// bit_floor( 0b00000110 ) = 0b00000100 +// bit_floor( 0b00000111 ) = 0b00000100 +// bit_floor( 0b00001000 ) = 0b00001000 +// bit_floor( 0b00001001 ) = 0b00001000 +template +CUTE_HOST_DEVICE constexpr +T +bit_floor(T x) { + return x == 0 ? 0 : (T(1) << (bit_width(x) - 1)); +} + +template +CUTE_HOST_DEVICE constexpr T rotl(T x, int s); +template +CUTE_HOST_DEVICE constexpr T rotr(T x, int s); + +// Computes the result of circular bitwise left-rotation +template +CUTE_HOST_DEVICE constexpr +T +rotl(T x, int s) { + constexpr int N = std::numeric_limits::digits; + return s == 0 ? x : s > 0 ? (x << s) | (x >> (N - s)) : rotr(x, -s); +} + +// Computes the result of circular bitwise right-rotation +template +CUTE_HOST_DEVICE constexpr +T +rotr(T x, int s) { + constexpr int N = std::numeric_limits::digits; + return s == 0 ? x : s > 0 ? 
(x >> s) | (x << (N - s)) : rotl(x, -s); +} + +// Counts the number of consecutive 0 bits, starting from the most significant bit +// countl_zero( 0b00000000 ) = 8 +// countl_zero( 0b11111111 ) = 0 +// countl_zero( 0b00011100 ) = 3 +template +CUTE_HOST_DEVICE constexpr +T +countl_zero(T x) { + return std::numeric_limits::digits - bit_width(x); +} + +// Counts the number of consecutive 1 bits, starting from the most significant bit +// countl_one( 0b00000000 ) = 0 +// countl_one( 0b11111111 ) = 8 +// countl_one( 0b11100011 ) = 3 +template +CUTE_HOST_DEVICE constexpr +T +countl_one(T x) { + return countl_zero(~x); +} + +// Counts the number of consecutive 0 bits, starting from the least significant bit +// countr_zero( 0b00000000 ) = 8 +// countr_zero( 0b11111111 ) = 0 +// countr_zero( 0b00011100 ) = 2 +template +CUTE_HOST_DEVICE constexpr +T +countr_zero(T x) { + return x == 0 ? std::numeric_limits::digits : bit_width(T(x & T(-x))) - 1; // bit_width of the LSB +} + +// Counts the number of consecutive 1 bits, starting from the least significant bit +// countr_one( 0b00000000 ) = 0 +// countr_one( 0b11111111 ) = 8 +// countr_one( 0b11100011 ) = 2 +template +CUTE_HOST_DEVICE constexpr +T +countr_one(T x) { + return countr_zero(~x); +} + +// Counts the number of 1 bits in an unsigned integer +// popcount( 0b00000000 ) = 0 +// popcount( 0b11111111 ) = 8 +// popcount( 0b00011101 ) = 4 +template +CUTE_HOST_DEVICE constexpr +int +popcount(T x) { + int c = 0; + while (x) { + ++c; + x &= x - 1; // clear the least significant bit set + } + return c; +} + +// +// Custom operations +// + +// Computes the result of bitwise left-shift +template +CUTE_HOST_DEVICE constexpr +T +shiftl(T x, int s) { + return s >= 0 ? (x << s) : (x >> -s); +} + +// Computes the result of bitwise right-shift +template +CUTE_HOST_DEVICE constexpr +T +shiftr(T x, int s) { + return s >= 0 ? (x >> s) : (x << -s); +} + +// Returns 1 if x > 0, -1 if x < 0, and 0 if x is zero. +template ::value)> +CUTE_HOST_DEVICE constexpr +int +signum(T const& x) { + return T(0) < x; +} + +template ::value)> +CUTE_HOST_DEVICE constexpr +int +signum(T const& x) { + return (T(0) < x) - (x < T(0)); +} + +// Safe divide +// @pre t % u == 0 +// @result t / u +template ::value && + std::is_integral::value)> +CUTE_HOST_DEVICE constexpr +auto +safe_div(T const& t, U const& u) { + //assert(t % u == 0); + return t / u; +} + +} // namespace cute diff --git a/include/cute/numeric/real.hpp b/include/cute/numeric/real.hpp new file mode 100644 index 0000000000..d85e30405a --- /dev/null +++ b/include/cute/numeric/real.hpp @@ -0,0 +1,56 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+namespace cute
+{
+
+/// Generic fused multiply-add
+template <class D, class A, class B, class C>
+CUTE_HOST_DEVICE constexpr
+void
+fma(D& d, A const& a, B const& b, C const& c)
+{
+  d = a * b + c;
+}
+
+/// Fused multiply-add for triplets
+template <class A, class B, class C>
+CUTE_HOST_DEVICE constexpr
+void
+fma(A const& a, B const& b, C& c)
+{
+  return fma(c, a, b, c);
+}
+
+} // end namespace cute
diff --git a/include/cute/numeric/tfloat.hpp b/include/cute/numeric/tfloat.hpp
new file mode 100644
index 0000000000..bb68b703eb
--- /dev/null
+++ b/include/cute/numeric/tfloat.hpp
@@ -0,0 +1,51 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
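[Editorial sketch, not part of the patch] The generic fma in real.hpp above works for any types supporting * and +; the three-argument form accumulates into its last argument:

#include <cute/numeric/real.hpp>

void fma_example() {
  float d, a = 2.0f, b = 3.0f, c = 1.0f;
  cute::fma(d, a, b, c);  // four-argument form: d = a*b + c = 7
  cute::fma(a, b, c);     // three-argument form accumulates: c = a*b + c = 7
}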
+ * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include + +namespace cute { + +using cutlass::tfloat32_t; + +// +// Display utilities +// + +CUTE_HOST std::ostream& operator<<(std::ostream& os, tfloat32_t const& v) +{ + return os << float(v); +} + +} // end namespace cute diff --git a/include/cute/numeric/uint128.hpp b/include/cute/numeric/uint128.hpp new file mode 100644 index 0000000000..fb02441fae --- /dev/null +++ b/include/cute/numeric/uint128.hpp @@ -0,0 +1,259 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#if defined(__CUDACC_RTC__) +#include +#else +#include +#include +#include +#include +#include +#endif + +#include + +/// Optionally enable GCC's built-in type +#if defined(__x86_64) && !defined(__CUDA_ARCH__) +# if defined(__GNUC__) && 0 +# define CUTE_UINT128_NATIVE +# elif defined(_MSC_VER) +# define CUTE_INT128_ARITHMETIC +# include +# endif +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cute { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +///! Unsigned 128b integer type +struct alignas(16) uint128_t +{ + /// Size of one part of the uint's storage in bits + static constexpr int storage_bits_ = 64; + + struct hilo + { + uint64_t lo; + uint64_t hi; + }; + + // Use a union to store either low and high parts or, if present, a built-in 128b integer type. 
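+  // Editorial note (not part of the patch): with CUTE_UINT128_NATIVE defined
+  // the union below lets this 16-byte storage be read as the compiler's
+  // built-in unsigned __int128; without it only the portable {lo, hi} pair
+  // is available, and MSVC builds instead use the _umul128/_udiv128
+  // intrinsics guarded by CUTE_INT128_ARITHMETIC.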
+ union + { + struct hilo hilo_; + +#if defined(CUTE_UINT128_NATIVE) + unsigned __int128 native; +#endif // defined(CUTE_UINT128_NATIVE) + }; + + // + // Methods + // + + /// Default ctor + CUTE_HOST_DEVICE constexpr + uint128_t() : hilo_{0, 0} {} + + /// Constructor from uint64 + CUTE_HOST_DEVICE constexpr + uint128_t(uint64_t lo_) : hilo_{lo_, 0} {} + + /// Constructor from two 64b unsigned integers + CUTE_HOST_DEVICE constexpr + uint128_t(uint64_t lo_, uint64_t hi_) : hilo_{lo_, hi_} {} + + /// Optional constructor from native value +#if defined(CUTE_UINT128_NATIVE) + uint128_t(unsigned __int128 value) : native(value) { } +#endif + + /// Lossily cast to uint64 + CUTE_HOST_DEVICE constexpr + explicit operator uint64_t() const + { + return hilo_.lo; + } + + template + CUTE_HOST_DEVICE constexpr + static void exception() + { + //static_assert(sizeof(Dummy) == 0, "Not implemented exception!"); + //abort(); + //printf("uint128 not implemented!\n"); + } + + /// Add + CUTE_HOST_DEVICE constexpr + uint128_t operator+(uint128_t const& rhs) const + { + uint128_t y; +#if defined(CUTE_UINT128_NATIVE) + y.native = native + rhs.native; +#else + y.hilo_.lo = hilo_.lo + rhs.hilo_.lo; + y.hilo_.hi = hilo_.hi + rhs.hilo_.hi + (!y.hilo_.lo && (rhs.hilo_.lo)); +#endif + return y; + } + + /// Subtract + CUTE_HOST_DEVICE constexpr + uint128_t operator-(uint128_t const& rhs) const + { + uint128_t y; +#if defined(CUTE_UINT128_NATIVE) + y.native = native - rhs.native; +#else + y.hilo_.lo = hilo_.lo - rhs.hilo_.lo; + y.hilo_.hi = hilo_.hi - rhs.hilo_.hi - (rhs.hilo_.lo && y.hilo_.lo > hilo_.lo); +#endif + return y; + } + + /// Multiply by unsigned 64b integer yielding 128b integer + CUTE_HOST_DEVICE constexpr + uint128_t operator*(uint64_t const& rhs) const + { + uint128_t y; +#if defined(CUTE_UINT128_NATIVE) + y.native = native * rhs; +#elif defined(CUTE_INT128_ARITHMETIC) + // Multiply by the low part + y.hilo_.lo = _umul128(hilo_.lo, rhs, &y.hilo_.hi); + + // Add the high part and ignore the overflow + uint64_t overflow; + y.hilo_.hi += _umul128(hilo_.hi, rhs, &overflow); +#else + exception(); +#endif + return y; + } + + /// Divide 128b operation by 64b operation yielding a 64b quotient + CUTE_HOST_DEVICE constexpr + uint64_t operator/(uint64_t const& divisor) const + { + uint64_t quotient = 0; +#if defined(CUTE_UINT128_NATIVE) + quotient = uint64_t(native / divisor); +#elif defined(CUTE_INT128_ARITHMETIC) + // implemented using MSVC's arithmetic intrinsics + uint64_t remainder = 0; + quotient = _udiv128(hilo_.hi, hilo_.lo, divisor, &remainder); +#else + exception(); +#endif + return quotient; + } + + /// Divide 128b operation by 64b operation yielding a 64b quotient + CUTE_HOST_DEVICE constexpr + uint64_t operator%(uint64_t const& divisor) const + { + uint64_t remainder = 0; +#if defined(CUTE_UINT128_NATIVE) + remainder = uint64_t(native % divisor); +#elif defined(CUTE_INT128_ARITHMETIC) + // implemented using MSVC's arithmetic intrinsics + (void)_udiv128(hilo_.hi, hilo_.lo, divisor, &remainder); +#else + exception(); +#endif + return remainder; + } + + /// Computes the quotient and remainder in a single method. 
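+  /// Editorial sketch (not part of the patch):
+  ///   uint64_t r;
+  ///   uint64_t q = x.divmod(r, 10);  // q = x / 10, r = x % 10
+  /// Like operator/ and operator%, this needs the native __int128 path or
+  /// the MSVC intrinsics; the portable fallback only calls exception().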
+ CUTE_HOST_DEVICE constexpr + uint64_t divmod(uint64_t &remainder, uint64_t divisor) const + { + uint64_t quotient = 0; +#if defined(CUTE_UINT128_NATIVE) + quotient = uint64_t(native / divisor); + remainder = uint64_t(native % divisor); +#elif defined(CUTE_INT128_ARITHMETIC) + // implemented using MSVC's arithmetic intrinsics + quotient = _udiv128(hilo_.hi, hilo_.lo, divisor, &remainder); +#else + exception(); +#endif + return quotient; + } + + /// Left-shifts a 128b unsigned integer + CUTE_HOST_DEVICE constexpr + uint128_t operator<<(int sh) const + { + if (sh == 0) { + return *this; + } + else if (sh >= storage_bits_) { + return uint128_t(0, hilo_.lo << (sh - storage_bits_)); + } + else { + return uint128_t( + (hilo_.lo << sh), + (hilo_.hi << sh) | uint64_t(hilo_.lo >> (storage_bits_ - sh)) + ); + } + } + + /// Right-shifts a 128b unsigned integer + CUTE_HOST_DEVICE constexpr + uint128_t operator>>(int sh) const + { + if (sh == 0) { + return *this; + } + else if (sh >= storage_bits_) { + return uint128_t((hilo_.hi >> (sh - storage_bits_)), 0); + } + else { + return uint128_t( + (hilo_.lo >> sh) | (hilo_.hi << (storage_bits_ - sh)), + (hilo_.hi >> sh) + ); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cute + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cute/pointer.hpp b/include/cute/pointer.hpp new file mode 100644 index 0000000000..40ce5d1aef --- /dev/null +++ b/include/cute/pointer.hpp @@ -0,0 +1,322 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
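[Editorial sketch, not part of the patch] The uint128_t shift operators above split the shift across the two 64-bit halves; a shift of 64 or more moves bits wholly between words:

void shift_example() {
  cute::uint128_t one(1);
  cute::uint128_t big  = one << 100;  // crosses the word boundary: hi = 1ull << 36, lo = 0
  cute::uint128_t back = big >> 100;  // round-trips to 1
}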
+ * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include +#include + +namespace cute +{ + +// +// has_dereference to determine if a type is a pointer concept +// + +template +struct has_dereference : std::false_type { +}; + +template +struct has_dereference())>> : std::true_type { +}; + +// +// Pointer categories +// + +template +struct is_gmem : false_type {}; + +template +struct is_smem : false_type {}; + +// Anything that is not gmem or smem is rmem +template +struct is_rmem : bool_constant< not (is_gmem::value || is_smem::value)> {}; + +// +// A very simplified wrapper for pointers -- use for constructing tagged pointers +// +template +struct device_ptr +{ + using value_type = T; + + CUTE_HOST_DEVICE constexpr + device_ptr(T* ptr) : ptr_(ptr) {} + + CUTE_HOST_DEVICE constexpr + T* get() const { return ptr_; } + + CUTE_HOST_DEVICE constexpr + T& operator*() const { return *ptr_; } + + template + CUTE_HOST_DEVICE constexpr + T& operator[](Index const& i) const { return ptr_[i]; } + + template + CUTE_HOST_DEVICE constexpr + DerivedType operator+(Index const& i) const { return {ptr_ + i}; } + + CUTE_HOST_DEVICE constexpr friend + std::ptrdiff_t operator-(device_ptr const& a, + device_ptr const& b) { + return a.ptr_ - b.ptr_; + } + + T* ptr_; +}; + +// +// gmem_ptr +// + +template +struct gmem_ptr : device_ptr> { + using device_ptr>::device_ptr; +}; + +template +CUTE_HOST_DEVICE constexpr +gmem_ptr +make_gmem_ptr(T* ptr) { + return {ptr}; +} + +template +CUTE_HOST_DEVICE constexpr +gmem_ptr +make_gmem_ptr(void* ptr) { + return {reinterpret_cast(ptr)}; +} + +template +struct is_gmem> : true_type {}; + +// +// smem_ptr +// + +template +struct smem_ptr : device_ptr> { + using device_ptr>::device_ptr; +}; + +template +CUTE_HOST_DEVICE constexpr +smem_ptr +make_smem_ptr(T* ptr) { + return {ptr}; +} + +template +CUTE_HOST_DEVICE constexpr +smem_ptr +make_smem_ptr(void* ptr) { + return {reinterpret_cast(ptr)}; +} + +template +struct is_smem> : true_type {}; + +// +// rmem_ptr +// + +template +struct rmem_ptr : device_ptr> { + using device_ptr>::device_ptr; +}; + +template +CUTE_HOST_DEVICE constexpr +rmem_ptr +make_rmem_ptr(T* ptr) { + return {ptr}; +} + +template +CUTE_HOST_DEVICE constexpr +rmem_ptr +make_rmem_ptr(void* ptr) { + return {reinterpret_cast(ptr)}; +} + +template +struct is_rmem> : true_type {}; + +// +// counting iterator -- quick and dirty +// + +struct counting +{ + using index_type = int; + using value_type = index_type; + + CUTE_HOST_DEVICE constexpr + counting() : n_(0) {} + CUTE_HOST_DEVICE constexpr + counting(index_type const& n) : n_(n) {} + + CUTE_HOST_DEVICE constexpr + index_type operator[](index_type const& i) const { return n_ + i; } + + CUTE_HOST_DEVICE constexpr + index_type const& operator*() const { return n_; } + + CUTE_HOST_DEVICE constexpr + counting operator+(index_type const& i) const { return {n_ + i}; } + CUTE_HOST_DEVICE constexpr + counting& operator++() { ++n_; return *this; } + + CUTE_HOST_DEVICE constexpr + bool operator==(counting const& other) const { return n_ == other.n_; } + CUTE_HOST_DEVICE constexpr + bool operator!=(counting const& other) const { return n_ != other.n_; } + + CUTE_HOST_DEVICE constexpr + bool operator< (counting const& other) const { return n_ < other.n_; } + + index_type n_; +}; + +// +// recast +// + +template +CUTE_HOST_DEVICE constexpr +auto +recast(T* ptr) { + return reinterpret_cast(ptr); +} + +template +CUTE_HOST_DEVICE 
constexpr +auto +recast(T const* ptr) { + return reinterpret_cast(ptr); +} + +template +CUTE_HOST_DEVICE constexpr +auto +recast(gmem_ptr const& ptr) { + return make_gmem_ptr(recast(ptr.ptr_)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +recast(gmem_ptr const& ptr) { + return make_gmem_ptr(recast(ptr.ptr_)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +recast(smem_ptr const& ptr) { + return make_smem_ptr(recast(ptr.ptr_)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +recast(smem_ptr const& ptr) { + return make_smem_ptr(recast(ptr.ptr_)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +recast(rmem_ptr const& ptr) { + return make_rmem_ptr(recast(ptr.ptr_)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +recast(rmem_ptr const& ptr) { + return make_rmem_ptr(recast(ptr.ptr_)); +} + +// +// Display utilities +// + +template +CUTE_HOST_DEVICE void print(T const* const ptr) +{ + printf("raw_ptr_%db(%p)", int(8*sizeof(T)), ptr); +} + +template +CUTE_HOST_DEVICE void print(gmem_ptr const& ptr) +{ + printf("gmem_ptr_%db(%p)", int(8*sizeof(T)), ptr.get()); +} + +template +CUTE_HOST_DEVICE void print(smem_ptr const& ptr) +{ + printf("smem_ptr_%db(%p)", int(8*sizeof(T)), ptr.get()); +} + +template +CUTE_HOST_DEVICE void print(rmem_ptr const& ptr) +{ + printf("rmem_ptr_%db(%p)", int(8*sizeof(T)), ptr.get()); +} + +template +CUTE_HOST std::ostream& operator<<(std::ostream& os, gmem_ptr const& ptr) +{ + return os << "gmem_ptr_" << int(8*sizeof(T)) << "b"; +} + +template +CUTE_HOST std::ostream& operator<<(std::ostream& os, smem_ptr const& ptr) +{ + return os << "smem_ptr_" << int(8*sizeof(T)) << "b"; +} + +template +CUTE_HOST std::ostream& operator<<(std::ostream& os, rmem_ptr const& ptr) +{ + return os << "rmem_ptr_" << int(8*sizeof(T)) << "b"; +} + +} // end namespace cute diff --git a/include/cute/stride.hpp b/include/cute/stride.hpp new file mode 100644 index 0000000000..5fb0da8aec --- /dev/null +++ b/include/cute/stride.hpp @@ -0,0 +1,411 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
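[Editorial sketch, not part of the patch] The tagged-pointer helpers in pointer.hpp above attach a memory-space tag that survives recast; the raw pointer here is a stand-in for a real device allocation:

void pointer_example(float* raw) {
  auto g = cute::make_gmem_ptr(raw);                  // gmem_ptr<float>
  static_assert(cute::is_gmem<decltype(g)>::value, "tag survives the wrapper");
  auto s   = cute::make_smem_ptr(raw);                // smem_ptr<float>
  auto g16 = cute::recast<cute::half_t>(g);           // gmem_ptr<half_t>, tag preserved
}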
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include + +namespace cute +{ + +/** crd2idx maps a coordinate within to an index + * This is computed as follows: + * [coord, shape, and stride are all integers => step forward by stride] + * op(c, s, d) => c * d + * [coord is integer, shape and stride are tuple => divmod coord for each mode] + * op(c, (s,S), (d,D)) => op(c % prod(s), s, d) + op(c / prod(s), (S), (D)) + * [coord, shape, and stride are all tuples => consider each mode independently] + * op((c,C), (s,S), (d,D)) => op(c, s, d) + op((C), (S), (D)) + */ + +template +CUTE_HOST_DEVICE constexpr +auto +crd2idx(Coord const& coord, + Shape const& shape, + Stride const& stride); + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +crd2idx_ttt(Coord const& coord, + Shape const& shape, + Stride const& stride, seq) +{ + return (... + crd2idx(get(coord), get(shape), get(stride))); +} + +template +CUTE_HOST_DEVICE constexpr +auto +crd2idx_itt(CInt const& coord, + STuple const& shape, + DTuple const& stride, seq) +{ + if constexpr (sizeof...(Is) == 0) { // Avoid recursion and mod on single/last iter + return crd2idx(coord, get(shape), get(stride)); + } else { // General case + return crd2idx(coord % product(get(shape)), get(shape), get(stride)) + + crd2idx_itt(coord / product(get(shape)), shape, stride, seq{}); + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +crd2idx(Coord const& coord, + Shape const& shape, + Stride const& stride) +{ + if constexpr (is_tuple::value) { + if constexpr (is_tuple::value) { // tuple tuple tuple + static_assert(tuple_size::value == tuple_size< Shape>::value, "Mismatched Ranks"); + static_assert(tuple_size::value == tuple_size::value, "Mismatched Ranks"); + return detail::crd2idx_ttt(coord, shape, stride, tuple_seq{}); + } else { // tuple "int" "int" + static_assert(sizeof(Coord) == 0, "Invalid parameters"); + } + } else { + if constexpr (is_tuple::value) { // "int" tuple tuple + static_assert(tuple_size::value == tuple_size::value, "Mismatched Ranks"); + return detail::crd2idx_itt(coord, shape, stride, tuple_seq{}); + } else { // "int" "int" "int" + return coord * stride; + } + } + + CUTE_GCC_UNREACHABLE; +} + +// +// If we know Stride is default [CompactColMajor], then we can take shortcuts +// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +crd2idx_horner(CTuple const& coord, + STuple const& shape, seq) +{ + if constexpr (sizeof...(Is) == 0) { // No recursion on single/last iter + return get(coord); + } else { // General case + return get(coord) + get(shape) * crd2idx_horner(coord, shape, seq{}); + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +crd2idx(Coord const& coord, + Shape const& shape) +{ + static_assert(decltype(congruent(coord,shape))::value, "Mismatched Ranks"); + if constexpr 
(is_tuple::value) { + // Flatten and apply Horner's method + auto flat_coord = flatten(coord); + auto flat_shape = flatten(shape); + return detail::crd2idx_horner(flat_coord, flat_shape, tuple_seq{}); + } else { + return coord; + } + + CUTE_GCC_UNREACHABLE; +} + +/** idx2crd splits an index to a coordinate within . + * + * This is computed as follows: + * [index, shape, and stride are all integers => determine 1D coord] + * op(i, s, d) => (i / d) % s + * [index is integer, shape and stride are tuple => determine component for each mode] + * op(i, (s,S), (d,D)) => (op(i, s, d), op(i, S, D)...) + * [index, shape, and stride are all tuples => consider each mode independently] + * op((i,I), (s,S), (d,D)) => (op(i, s, d), op((I), (S), (D))) + * + * NOTE: This only works for compact shape+stride layouts. A more general version would + * apply to all surjective layouts + */ + +template +CUTE_HOST_DEVICE constexpr +auto +idx2crd(Index const& idx, + Shape const& shape, + Stride const& stride) +{ + if constexpr (is_tuple::value) { + if constexpr (is_tuple::value) { // tuple tuple tuple + static_assert(tuple_size::value == tuple_size< Shape>::value, "Mismatched Ranks"); + static_assert(tuple_size::value == tuple_size::value, "Mismatched Ranks"); + return transform(idx, shape, stride, [](auto const& i, auto const& s, auto const& d){ return idx2crd(i,s,d); }); + } else { // tuple "int" "int" + static_assert(sizeof(Index) == 0, "Invalid parameters"); + } + } else { + if constexpr (is_tuple::value) { + if constexpr (is_tuple::value) { // "int" tuple tuple + static_assert(tuple_size::value == tuple_size::value, "Mismatched Ranks"); + return transform(shape, stride, [&](auto const& s, auto const& d){ return idx2crd(idx,s,d); }); + } else { // "int" tuple "int" + return transform(shape, compact_col_major(shape, stride), [&](auto const& s, auto const& d){ return idx2crd(idx,s,d); }); + } + } else { // "int" "int" "int" + return (idx / stride) % shape; + } + } + + CUTE_GCC_UNREACHABLE; +} + +// +// If we know Stride is default [CompactColMajor], then we can take shortcuts +// + +//(idx / 1) % s0 +//(idx / s0) % s1 +//(idx / (s0 * s1)) % s2 +//... 
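+// Editorial example (not part of the patch): for shape (3,4,5),
+// idx2crd(23, (3,4,5)) = ( 23 % 3, (23/3) % 4, (23/12) % 5 ) = (2,3,1),
+// and crd2idx((2,3,1), (3,4,5)) = 2*1 + 3*3 + 1*12 = 23 recovers the index.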
+ +template +CUTE_HOST_DEVICE constexpr +auto +idx2crd(Index const& idx, + Shape const& shape) +{ + if constexpr (is_tuple::value) { + if constexpr (is_tuple::value) { // tuple tuple + static_assert(tuple_size::value == tuple_size::value, "Mismatched Ranks"); + return transform(idx, shape, [](auto const& i, auto const& s) { return idx2crd(i,s); }); + } else { // tuple "int" + static_assert(sizeof(Index) == 0, "Invalid parameters"); + } + } else { + if constexpr (is_tuple::value) { // "int" tuple + return idx2crd(idx, shape, compact_col_major(shape)); + } else { // "int" "int" + return idx; + } + } + + CUTE_GCC_UNREACHABLE; +} + +// +// crd2crd +// + +template +CUTE_HOST_DEVICE constexpr +auto +crd2crd(Coord const& coord, + SShape const& src_shape, + DShape const& dst_shape) +{ + if constexpr (is_tuple::value && is_tuple::value && is_tuple::value) { + static_assert(tuple_size::value == tuple_size::value, "Mismatched Ranks"); + static_assert(tuple_size::value == tuple_size::value, "Mismatched Ranks"); + return transform(coord, src_shape, dst_shape, [](auto const& c, auto const& s, auto const& d) { return crd2crd(c,s,d); }); + } else { + // assert(size(src_shape) == size(dst_shape)) + return idx2crd(crd2idx(coord, src_shape), dst_shape); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Compact Major +// + +// General tag for common layouts and dispatching +struct GenColMajor {}; +struct GenRowMajor {}; + +template , class Major = GenColMajor> +CUTE_HOST_DEVICE constexpr +auto +compact_major(Shape const& shape, + Current const& current = {}, + Major const& major = {}); + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +compact_major_ti(Shape const& shape, + Current const& current, + GenColMajor const& major, seq) +{ + return cute::make_tuple(compact_major(get(shape), current * product<0,Is>(shape), major)...); +} + +template +CUTE_HOST_DEVICE constexpr +auto +compact_major_ti(Shape const& shape, + Current const& current, + GenRowMajor const& major, seq) +{ + constexpr int E = tuple_size::value; + return cute::make_tuple(compact_major(get(shape), current * product(shape), major)...); +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +compact_major(Shape const& shape, + Current const& current, + Major const& major) +{ + if constexpr (is_tuple::value) { + if constexpr (is_tuple::value) { // tuple tuple + static_assert(tuple_size::value == tuple_size::value, "Mismatched Ranks"); + return transform(shape, current, [&](auto const& s, auto const& c){ return compact_major(s,c,major); }); + } else { // tuple int + return detail::compact_major_ti(shape, current, major, tuple_seq{}); + } + } else { + if constexpr (is_tuple::value) { // int tuple + static_assert(sizeof(Shape) == 0, "Invalid parameters"); + } else { // int int + if constexpr (is_constant<1, Shape>::value) { + return Int<0>{}; // If current is dynamic, this could save a reg + } else { + return current; + } + } + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Compact Col Major +// + +template > +CUTE_HOST_DEVICE constexpr +auto +compact_col_major(Shape const& shape, + Current const& current = {}) +{ + return compact_major(shape, current, GenColMajor{}); +} + +template +using ColMajor = decltype(compact_col_major(std::declval())); + +// +// Compact Row Major +// + +template > +CUTE_HOST_DEVICE constexpr +auto +compact_row_major(Shape const& shape, + Current const& current = {}) +{ + return compact_major(shape, current, GenRowMajor{}); +} + +template +using RowMajor = 
decltype(compact_row_major(std::declval())); + +// +// Compact Order -- compute a compact stride based on an ordering of the modes +// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +compact_order(Shape const& shape, Order const& order, + OrigShape const& orig_shape, OrigOrder const& orig_order) +{ + if constexpr (is_tuple::value) { + return transform(shape, order, [&](auto const& x, auto const& y) { return compact_order(x, y, orig_shape, orig_order); }); + } else { + auto d = product(transform(orig_shape, orig_order, + [&](auto const& s, auto const& o) { + return conditional_return(o < order, product(s), Int<1>{}); + })); + return compact_col_major(shape, d); + } + + CUTE_GCC_UNREACHABLE; +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +compact_order(Shape const& shape, Order const& order) +{ + static_assert(is_congruent::value, "Need congruence of shape and order."); + return detail::compact_order(shape, order, flatten_to_tuple(shape), flatten_to_tuple(order)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +compact_order(Shape const& shape, GenColMajor const& major) +{ + return compact_major(shape, Int<1>{}, major); +} + +template +CUTE_HOST_DEVICE constexpr +auto +compact_order(Shape const& shape, GenRowMajor const& major) +{ + return compact_major(shape, Int<1>{}, major); +} + +} // end namespace cute diff --git a/include/cute/swizzle.hpp b/include/cute/swizzle.hpp new file mode 100644 index 0000000000..0a13e55143 --- /dev/null +++ b/include/cute/swizzle.hpp @@ -0,0 +1,497 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include +#include +#include +#include + +namespace cute +{ + +// A generic Swizzle functor +/* 0bxxxxxxxxxxxxxxxYYYxxxxxxxZZZxxxx + * ^--^ MBase is the number of least-sig bits to keep constant + * ^-^ ^-^ BBits is the number of bits in the mask + * ^---------^ SShift is the distance to shift the YYY mask + * (pos shifts YYY to the right, neg shifts YYY to the left) + * + * e.g. Given + * 0bxxxxxxxxxxxxxxxxYYxxxxxxxxxZZxxx + * the result is + * 0bxxxxxxxxxxxxxxxxYYxxxxxxxxxAAxxx where AA = ZZ xor YY + */ +template +struct Swizzle +{ + static constexpr int num_bits = BBits; + static constexpr int num_base = MBase; + static constexpr int num_shft = SShift; + + static_assert(num_base >= 0, "MBase must be non-negative."); + static_assert(num_bits >= 0, "BBits must be non-negative."); + static_assert(abs(num_shft) >= num_bits, "abs(SShift) must be at least BBits."); + + // using 'int' type here to avoid unintentionally casting to unsigned... unsure. + using bit_msk = cute::constant; + using yyy_msk = cute::constant; + using zzz_msk = cute::constant; + using msk_sft = cute::constant; + + static constexpr uint32_t swizzle_code = uint32_t(yyy_msk{} | zzz_msk{}); + + template ::value)> + CUTE_HOST_DEVICE constexpr static + auto + apply(Offset const& offset) + { + return offset ^ shiftr(offset & yyy_msk{}, msk_sft{}); // ZZZ ^= YYY + } + + template ::value)> + CUTE_HOST_DEVICE constexpr + auto + operator()(Offset const& offset) const + { + return apply(offset); + } +}; + +// Translation for legacy SwizzleXor +// TODO: Deprecate +template +using SwizzleXor = Swizzle; + +// +// make_swizzle<0b1000, 0b0100>() -> Swizzle<1,2,1> +// make_swizzle<0b11000000, 0b00000110>() -> Swizzle<2,1,5> +// + +template +CUTE_HOST_DEVICE constexpr +auto +make_swizzle() +{ + constexpr uint32_t BY = popcount(Y); // Number of swizzle bits in Y + constexpr uint32_t BZ = popcount(Z); // Number of swizzle bits in Z + static_assert(BZ == BY, "Number of bits in Y and Z don't match"); + constexpr uint32_t TZ_Y = countr_zero(Y); // Number of trailing zeros in Y + constexpr uint32_t TZ_Z = countr_zero(Z); // Number of trailing zeros in Z + constexpr uint32_t M = cute::min(TZ_Y, TZ_Z) % 32; + constexpr int32_t S = int32_t(TZ_Y) - int32_t(TZ_Z); // Difference in trailing zeros + static_assert((Y | Z) == Swizzle::swizzle_code, "Reconstructed swizzle code does not match the Y and Z masks."); + return Swizzle{}; +} + +template +CUTE_HOST_DEVICE constexpr +auto +composition(Swizzle, Swizzle) +{ + static_assert(S0 == S1, "Can only merge swizzles of the same shift."); + constexpr uint32_t Y = Swizzle::yyy_msk::value ^ Swizzle::yyy_msk::value; + constexpr uint32_t Z = Swizzle::zzz_msk::value ^ Swizzle::zzz_msk::value; + return make_swizzle(); + + //return ComposedFn, Swizzle>{}; +} + +// +// Upcast and Downcast +// + +template +CUTE_HOST_DEVICE constexpr +auto +upcast(Swizzle const& swizzle) +{ + static_assert(has_single_bit(N), "N must be a power of two"); + constexpr int log2_n = bit_width(uint32_t(N)) - 1; + constexpr int NewM = M - log2_n; + if constexpr (NewM >= 0) { + return Swizzle{}; + } else { + return Swizzle{}; + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +downcast(Swizzle const& swizzle) +{ + static_assert(has_single_bit(N), "N must be a power of two"); + constexpr int log2_n = bit_width(uint32_t(N)) - 1; + return Swizzle{}; +} + +template +CUTE_HOST_DEVICE constexpr +auto +recast(Swizzle const& swizzle) +{
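+  // Rescale the swizzle's M (base) bits to the new element width, e.g.
+  // recast<uint8_t,uint16_t>(Swizzle<2,3,3>{}) folds element pairs and yields
+  // Swizzle<2,2,3> via upcast<2> (illustrative values).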
+ if constexpr (sizeof_bits::value == sizeof_bits::value) { + return swizzle; + } else if constexpr (sizeof_bits::value > sizeof_bits::value) { + static_assert(sizeof_bits::value % sizeof_bits::value == 0, "NewType must be a multiple of OldType"); + return upcast::value/sizeof_bits::value>(swizzle); + } else if constexpr (sizeof_bits::value < sizeof_bits::value) { + static_assert(sizeof_bits::value % sizeof_bits::value == 0, "NewType must be a divisor of OldType"); + return downcast::value/sizeof_bits::value>(swizzle); + } +} + +// +// Utility for slicing and swizzle "offsets" +// + +// For swizzle functions, it is often needed to keep track of which bits are +// consumed and which bits are free. Furthermore, it is useful to know whether +// each of these bits is known statically or dynamically. + +// MixedBits is an integer class where some bits are known statically and some +// bits are known dynamically. These sets of bits are disjoint and it is known +// statically which bits are known dynamically. + +// MixedBits can only be manipulated through bitwise operations + +// Abstract value: StaticInt | (dynamic_int_ & StaticFlags) +template // 0: static, 1: dynamic +struct MixedBits +{ + // Representation invariants + static_assert(StaticFlags != 0, "Should be at least one dynamic bit in MixedBits."); + static_assert((StaticInt & StaticFlags) == 0, "No static/dynamic overlap allowed in MixedBits."); + // assert((dynamic_int_ & ~F) == 0); + + DynamicType dynamic_int_; +}; + +template +CUTE_HOST_DEVICE constexpr +auto +make_mixed_bits(constant const&, DynamicType const& d, constant const&) +{ + static_assert(is_integral::value); + if constexpr (is_static::value) { + static_assert((s & DynamicType::value & f) == 0, "No static/dynamic overlap allowed."); + return constant{} | (d & constant{}); // Just return a static int + } else if constexpr (f == 0) { + return constant{}; // Just return a static int + } else { + return MixedBits{d & f}; // MixedBits + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Explicit conversion for now -- consider casting on plus or minus +// + +template +CUTE_HOST_DEVICE constexpr +auto +to_integral(MixedBits const& m) +{ + //return S | (m.dynamic_int_ & F); + return S | m.dynamic_int_; +} + +// Any cute::is_integral +template ::value)> +CUTE_HOST_DEVICE constexpr +auto +to_integral(I const& i) +{ + return i; +} + +// +// Operators +// + +// Equality +template +CUTE_HOST_DEVICE constexpr +auto +operator==(MixedBits const& m, constant const&) +{ + return (S0 == (S1 & ~F0)) && (m.dynamic_int_ == (S1 & F0)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator==(constant const& s, MixedBits const& m) +{ + return m == s; +} + +// Bitwise AND +template +CUTE_HOST_DEVICE constexpr +auto +operator&(MixedBits const& m0, MixedBits const& m1) +{ + // Truth table for (S0,D0,F0) & (S1,D1,F1) -> (S,D,F) + // S0D0F0 | 0X0 | 001 | 011 | 1X0 | + // S1D1F1 + // 0X0 | 0X0 | 0X0 | 0X0 | 0X0 | + // 001 | 0X0 | 001 | 001 | 001 | + // 011 | 0X0 | 001 | 011 | 011 | + // 1X0 | 0X0 | 001 | 011 | 1X0 | + + return make_mixed_bits(constant{}, + //(S0 | m0.dynamic_int_) & (S1 | m1.dynamic_int_), + ((S1 & F0) & m0.dynamic_int_) | ((S0 & F1) & m1.dynamic_int_) | (m0.dynamic_int_ & m1.dynamic_int_), + constant{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator&(MixedBits const& m, constant const&) +{ + return make_mixed_bits(constant{}, + m.dynamic_int_, + constant{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator&(constant const& s, MixedBits const& m) +{ + return m & s; +} 
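+
+// e.g. Given auto m = make_mixed_bits(Int<0b0100>{}, dyn, Int<0b0011>{}), where
+//      dyn is some dynamic integer, m abstracts 0b0100 | (dyn & 0b0011): bit 2 is
+//      statically 1 and bits 0-1 are dynamic. Then m & Int<0b0110>{} has static
+//      bits 0b0100 and dynamic flags 0b0010, statically clearing bit 0. (Illustrative.)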
+ +// Bitwise OR +template +CUTE_HOST_DEVICE constexpr +auto +operator|(MixedBits const& m0, MixedBits const& m1) +{ + // Truth table for (S0,D0,F0) | (S1,D1,F1) -> (S,D,F) + // S0D0F0 | 0X0 | 001 | 011 | 1X0 | + // S1D1F1 + // 0X0 | 0X0 | 001 | 011 | 1X0 | + // 001 | 001 | 001 | 011 | 1X0 | + // 011 | 011 | 011 | 011 | 1X0 | + // 1X0 | 1X0 | 1X0 | 1X0 | 1X0 | + + return make_mixed_bits(constant{}, + ((~S1 & F0) & m0.dynamic_int_) | ((~S0 & F1) & m1.dynamic_int_), + constant{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator|(MixedBits const& m, constant const&) +{ + return make_mixed_bits(constant{}, + m.dynamic_int_, + constant{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator|(constant const& s, MixedBits const& m) +{ + return m | s; +} + +// Bitwise XOR +template +CUTE_HOST_DEVICE constexpr +auto +operator^(MixedBits const& m0, MixedBits const& m1) +{ + // Truth table for (S0,D0,F0) ^ (S1,D1,F1) -> (S,D,F) + // S0D0F0 | 0X0 | 001 | 011 | 1X0 | + // S1D1F1 + // 0X0 | 0X0 | 001 | 011 | 1X0 | + // 001 | 001 | 001 | 011 | 011 | + // 011 | 011 | 011 | 001 | 001 | + // 1X0 | 1X0 | 011 | 001 | 0X0 | + + return make_mixed_bits(constant{}, + (S0 | m0.dynamic_int_) ^ (S1 | m1.dynamic_int_), + constant{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator^(MixedBits const& m, constant const&) +{ + return make_mixed_bits(constant{}, + (S0 | m.dynamic_int_) ^ S1, + constant{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +operator^(constant const& s, MixedBits const& m) +{ + return m ^ s; +} + +// +// upcast and downcast +// + +template +CUTE_HOST_DEVICE constexpr +auto +safe_div(MixedBits const& m, constant const& s) +{ + static_assert(has_single_bit(S1), "Only divide MixedBits by powers of two."); + return make_mixed_bits(safe_div(constant{}, s), + safe_div(m.dynamic_int_, s), + safe_div(constant{}, s)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +upcast(MixedBits const& m) +{ + static_assert(has_single_bit(N), "Only divide MixedBits by powers of two."); + return safe_div(m, constant{}); +} + +template ::value)> +CUTE_HOST_DEVICE constexpr +auto +upcast(T const& m) +{ + return safe_div(m, constant{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +downcast(MixedBits const& m) +{ + static_assert(has_single_bit(N), "Only scale MixedBits by powers of two."); + return make_mixed_bits(constant{}, + m.dynamic_int_ * N, + constant{}); +} + +template ::value)> +CUTE_HOST_DEVICE constexpr +auto +downcast(T const& m) +{ + return m * constant{}; +} + +// +// Convert a Pow2Layout+Coord to a MixedBits +// + +template +CUTE_HOST_DEVICE constexpr +auto +to_mixed_bits(Shape const& shape, Stride const& stride, Coord const& coord) +{ + if constexpr (is_tuple::value && is_tuple::value && is_tuple::value) { + static_assert(tuple_size::value == tuple_size::value, "Mismatched ranks"); + static_assert(tuple_size::value == tuple_size::value, "Mismatched ranks"); + return transform_apply(shape, stride, coord, [](auto const& s, auto const& d, auto const& c) { return to_mixed_bits(s,d,c); }, + [](auto const&... 
a) { return (a ^ ...); }); + } else if constexpr (is_integral::value && is_integral::value && is_integral::value) { + static_assert(decltype(shape*stride)::value == 0 || has_single_bit(decltype(shape*stride)::value), "Requires pow2 shape*stride."); + return make_mixed_bits(Int<0>{}, coord * stride, (shape - Int<1>{}) * stride); + } else { + static_assert(is_integral::value && is_integral::value && is_integral::value, "Shape, Stride, and Coord must either all be tuples or all be integral (in the sense of cute::is_integral)."); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +to_mixed_bits(Layout const& layout, Coord const& coord) +{ + return to_mixed_bits(layout.shape(), layout.stride(), idx2crd(coord, layout.shape())); +} + +// +// Display utilities +// + +template +CUTE_HOST_DEVICE void print(MixedBits const& m) +{ + printf("M_%u|(%u&%u)=%u", S, uint32_t(m.dynamic_int_), F, to_integral(m)); +} + +template +CUTE_HOST std::ostream& operator<<(std::ostream& os, MixedBits const& m) +{ + return os << "M_" << S << "|(" << uint32_t(m.dynamic_int_) << "&" << F << ")=" << to_integral(m); +} + +template +CUTE_HOST_DEVICE void print(Swizzle const&) +{ + printf("S<%d,%d,%d>", B, M, S); +} + +template +CUTE_HOST std::ostream& operator<<(std::ostream& os, Swizzle const&) +{ + return os << "S<" << B << "," << M << "," << S << ">"; +} + +} // end namespace cute diff --git a/include/cute/swizzle_layout.hpp b/include/cute/swizzle_layout.hpp new file mode 100644 index 0000000000..1376a47ddd --- /dev/null +++ b/include/cute/swizzle_layout.hpp @@ -0,0 +1,1010 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * + **************************************************************************************************/ +#pragma once + +#include + +#include + +#include + +/* This implements a ComposedLayout of the form + * InvolutionFn o OffsetPlus o Layout + * where the InvolutionFn need not be linear (hence the need for the Offset). + * + * This ComposedLayout provides similar coordinate-to-index mapping and layout manipulations, + * but is not considered a "normal" layout. + * For example, this layout provides size() functions, but does not provide stride() functions. + * + * Furthermore, for known InvolutionFns, this layout attempts to decay itself + * to a normal-layout with dynamic or static strides. + * This is possible by determining the subdomain of the Involution function + * that is identity and testing if the right Layout's codomain is contained + * within it. + */ + +namespace cute +{ + +// A Layout of non-trivially composable functions: F o I o L +template +struct ComposedLayout + : private cute::tuple // EBO for static layouts +{ + CUTE_HOST_DEVICE constexpr + ComposedLayout(InvolutionFn const& fn = {}, + IntermediateOffset const& offset = {}, + Layout const& layout = {}) + : cute::tuple(fn, offset, layout) + {} + + // + // Accessors + // + + static constexpr int rank = Layout::rank; + + CUTE_HOST_DEVICE constexpr + decltype(auto) + swizzle_fn() const { + return get<0>(static_cast const&>(*this)); + } + + CUTE_HOST_DEVICE constexpr + decltype(auto) + offset_fn() const { + return get<1>(static_cast const&>(*this)); + } + + CUTE_HOST_DEVICE constexpr + decltype(auto) + layout_fn() const { + return get<2>(static_cast const&>(*this)); + } + + CUTE_HOST_DEVICE constexpr + decltype(auto) + layout() const { + return *this; + } + + CUTE_HOST_DEVICE constexpr + decltype(auto) + shape() const { + return layout_fn().shape(); + } + + // Doesn't really make sense to ask for the strides of this "layout" + CUTE_HOST_DEVICE constexpr + decltype(auto) + stride() const = delete; + + // + // Mappings + // + + // Map a logical coordinate to a linear index (Coord has no Underscore slice operators) + // OR + // Slice the layout and return the sublayout (Coord has an Underscore slice op) + template + CUTE_HOST_DEVICE constexpr + auto + operator()(Coord const& coord) const { + if constexpr (has_underscore::value) { + return slice(coord, *this); + } else { + return swizzle_fn()(to_integral(offset_fn()) + layout_fn()(coord)); // (F o L)(c) + } + + CUTE_GCC_UNREACHABLE; + } + + // Map a 1D linear coordinate to a flat ND logical coordinate + template ::value)> + CUTE_HOST_DEVICE constexpr + auto + operator[](Int const& linear_idx) const { + return get_flat_coord(linear_idx); + } + + // Convenience function for multi-dimensional coordinates + template + CUTE_HOST_DEVICE constexpr + auto + operator()(Coord0 const& c0, Coord1 const& c1, Coords const&... cs) const { + return operator()(make_coord(c0,c1,cs...)); + } + + // + // Compose + // + + template + CUTE_HOST_DEVICE constexpr + auto + compose(OtherLayout const& other) const { + return composition(*this, other); + } + + template + CUTE_HOST_DEVICE constexpr + auto + compose(Layouts const&... layouts) const { + return composition(*this, make_tile(layouts...)); + } + + template + CUTE_HOST_DEVICE constexpr + auto + with_shape(OtherShape const& shape) const { + return composition(*this, make_layout(shape)); + } + + template + CUTE_HOST_DEVICE constexpr + auto + with_shape(Shapes const&... 
shapes) const { + return composition(*this, make_layout(make_shape(shapes...))); + } + + // + // Tile + // + + template + CUTE_HOST_DEVICE constexpr + auto + tile(OtherLayout const& other) const { + return tiled_divide(*this, other); + } + + template + CUTE_HOST_DEVICE constexpr + auto + tile(Layouts const&... layouts) const { + return tiled_divide(*this, make_tile(layouts...)); + } + + // + // Utility + // + + // + // Index to Coordinate + // + + // NOTE Only valid for compact layouts + + // Return the (hierarchical) ND logical coordinate corresponding to the linear index + // @post this->crd2idx(@a result) == idx + // @post congruent(@a result, shape()) + template ::value)> + CUTE_HOST_DEVICE constexpr + auto + get_hier_coord(IInt const& idx) const { + return layout_fn().get_hier_coord(swizzle_fn()(idx) - to_integral(offset_fn())); // (L^-1 o F)(k) + } + + // Return the (flat) ND logical coordinate corresponding to the linear index + // @post this->crd2idx(@a result) == idx + // @post rank(@a result) == rank(shape()) && depth(@a result) == 1 + template ::value)> + CUTE_HOST_DEVICE constexpr + auto + get_flat_coord(IInt const& idx) const { + return layout_fn().get_flat_coord(swizzle_fn()(idx) - to_integral(offset_fn())); // (L^-1 o F)(k) + } + + // Return the generalized column-major 1D logical coordinate corresponding to the linear index + // @post this->crd2idx(@a result) == idx + // @post is_integral::value + template ::value)> + CUTE_HOST_DEVICE constexpr + auto + get_1d_coord(IInt const& idx) const { + return layout_fn().get_1d_coord(swizzle_fn()(idx) - to_integral(offset_fn())); // (L^-1 o F)(k) + } +}; + +template +struct is_layout> : true_type {}; + +template +struct is_composed_layout : false_type {}; +template +struct is_composed_layout> : true_type {}; + +// +// Constructors +// + +template +CUTE_HOST_DEVICE constexpr +auto +make_layout(Swizzle const& sxor) +{ + return composition(sxor, Layout,Int<1>>{}); +} + +template +CUTE_HOST_DEVICE constexpr +auto +make_layout(ComposedLayout const& a, Layout const& b) +{ + return composition(a.swizzle_fn(), a.offset_fn(), make_layout(a.layout_fn(), b)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +make_layout(Layout const& a, ComposedLayout const& b) +{ + return composition(b.swizzle_fn(), b.offset_fn(), make_layout(a, b.layout_fn())); +} + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +transfer_swizzle(Layout const& old_layout, + Layout const& new_layout) +{ + // Our goal is to determine a new swizzle for the strides in new_layout for consistent vectorizations + + // This is accomplished by identifying + // S o L :=: S? 
o L* + // We identify the "active" portion of S by computing (P o L)(c*) where P is a projection generated by S + // Then that active identifier is transformed through the layouts: + // L*(L[(P o L)(c*)]) + // which is a new swizzle identifier for S?, the new swizzle + + // Projections of the swizzle layout for composition, P + auto swizzle_only_zy = make_layout(make_shape (Int<(1 << M)>{}, Int<(1 << B)>{}, Int<(1 << (abs(S)-B))>{}, Int<(1 << B )>{}, Int<1>{}), + make_stride( Int<0>{}, Int<(1 << M)>{}, Int<0>{}, Int<(1 << (M+abs(S)))>{}, Int<0>{})); + + // Compose with the tile to get the swizzle projection, P o L [The Z and Y contributing portions of L] + auto layout_only_zy = composition(swizzle_only_zy, old_layout); + // Transform the end coordinate to get the active bits of the swizzle, (P o L)(c*) + auto swizzle_active_bits = layout_only_zy(size(layout_only_zy)-Int<1>{}); + + // Get the Z bit and the Y bits -- keep only those that are active in Z *and* Y + auto zzz_msk = typename Swizzle::zzz_msk{}; + auto yyy_msk = typename Swizzle::yyy_msk{}; + auto msk_sft = typename Swizzle::msk_sft{}; + auto active_Z = swizzle_active_bits & shiftr(swizzle_active_bits, msk_sft) & zzz_msk; + auto active_Y = swizzle_active_bits & shiftr(swizzle_active_bits, -msk_sft) & yyy_msk; + + // Pass the identifiers through the old layout and new layout to make a new swizzle identifier, L*(L[(P o L)(c*)]) + auto new_active_Z = new_layout(old_layout.get_1d_coord(active_Z)); + auto new_active_Y = new_layout(old_layout.get_1d_coord(active_Y)); + + // Use this new swizzle identifier to construct the new swizzle for new_layout + // (this also makes sure it's a "valid" swizzle that Swizzle can represent) + return composition(make_swizzle(), new_layout); +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +make_fragment_like(ComposedLayout,Offset,Layout> const& layout) +{ + return detail::transfer_swizzle(layout.layout_fn(), make_fragment_like(layout.layout_fn())); +} + +// +// Utilities +// + +// Return the layout of a mode +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +layout(ComposedLayout const& clayout) +{ + return composition(clayout.swizzle_fn(), clayout.offset_fn(), layout(clayout.layout_fn())); +} + +// Return the shape of a mode +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +shape(ComposedLayout const& layout) +{ + return shape(layout.layout_fn()); +} + +// Doesn't make sense to directly ask for the strides of this "layout" +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +stride(ComposedLayout const& layout) = delete; + +// Return the number of elements in a mode +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +size(ComposedLayout const& layout) +{ + return size(layout.layout_fn()); +} + +// Return the number of modes +template +CUTE_HOST_DEVICE constexpr +auto +rank(ComposedLayout const& layout) +{ + return rank(layout.layout_fn()); +} + +// Return the depth of the layout +template +CUTE_HOST_DEVICE constexpr +auto +depth(ComposedLayout const& layout) +{ + return depth(layout.layout_fn()); +} + +// Return the codomain size of a mode +template +CUTE_HOST_DEVICE constexpr +auto +cosize(ComposedLayout const& layout) +{ + return cosize(layout.layout_fn()); +} + +// +// Operations to manipulate Layouts like a tuple of pairs +// + +template +CUTE_HOST_DEVICE constexpr +auto +get(ComposedLayout const& a) +{ + return composition(a.swizzle_fn(), a.offset_fn(), get(a.layout_fn())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +take(ComposedLayout const& a) +{ 
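+  // e.g. take<0,2>(cl) re-applies cl's swizzle and offset to modes [0,2) of the
+  // underlying layout, so tuple-style manipulation preserves the swizzle (illustrative).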
+ return composition(a.swizzle_fn(), a.offset_fn(), take(a.layout_fn())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +flatten(ComposedLayout const& a) +{ + return composition(a.swizzle_fn(), a.offset_fn(), flatten(a.layout_fn())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +append(ComposedLayout const& a, X const& x) +{ + return composition(a.swizzle_fn(), a.offset_fn(), append(a.layout_fn(), x)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +group(ComposedLayout const& a) +{ + return composition(a.swizzle_fn(), a.offset_fn(), group(a.layout_fn())); +} + +// +// Slice a ComposedLayout +// + +namespace detail { + +template +CUTE_HOST_DEVICE constexpr +auto +make_swizzle_strides(true_type, + IntZ const& Z, + IntY const& Y, + Offset const& offset, + int_sequence) +{ + // Below is an optimized/compressed version of: + //return make_tuple((swizzle(offset + Z*Int<(1 << I)>{}) - swizzle(offset))...); + // with knowledge of Swizzle, I... ranges for each B bits, + // and the layout won't slice along z-bits that are already set + + // y\z 0 1 + // 0 Z DC + // 1 -Z DC + + return cute::make_tuple(conditional_return((offset & (Y << Int{})) == Int<0>{}, Z << Int{}, -(Z << Int{}))...); +} + +template +CUTE_HOST_DEVICE constexpr +auto +make_swizzle_strides(false_type, + IntZ const& Z, + IntY const& Y, + Offset const& offset, + int_sequence) +{ + // Below is an optimized/compressed version of: + //return make_tuple((swizzle(offset + Y*Int<(1 << I)>{}) - swizzle(offset))...); + // with knowledge of Swizzle, I... ranges for each B bits, + // and the layout won't slice along y-bits that are already set + + // y\z 0 1 + // 0 Y+Z Y-Z + // 1 DC DC + + return cute::make_tuple(conditional_return((offset & (Z << Int{})) == Int<0>{}, (Y+Z) << Int{}, (Y-Z) << Int{})...); +} + +} // end namespace detail + +template +CUTE_HOST_DEVICE constexpr +auto +slice_and_offset(Coord const& coord, ComposedLayout,Offset,Layout> const& layout) +{ + if constexpr (all_underscore::value) { + // Skip the expensive/complicated attempt to decay to a normal layout and just reshape + return cute::make_tuple(composition(layout.swizzle_fn(), layout.offset_fn(), slice(coord, layout.layout_fn())), Int<0>{}); + } else { + + // Projections of the swizzle layout for composition + auto sw = make_layout(make_shape(Int<(1 << M)>{}, Int<(1 << B)>{}, Int<(1 << (abs(S)-B))>{}, Int<(1 << B)>{}, Int<1>{})); + + auto swizzle_anti_zy = make_layout(shape(sw), + make_stride(stride<0>(sw), Int<0>{}, stride<2>(sw), Int<0>{}, size(sw))); + auto swizzle_only_zy = make_layout(shape(sw), + make_stride( Int<0>{}, stride<1>(sw), Int<0>{}, stride<3>(sw), Int<0>{})); + + // The portion of the layout that is not yet consumed + auto sliced_layout = slice(coord, layout.layout_fn()); + + // If the sliced_layout hits two bits that are swizzled together, then don't attempt to decay + + // Compose with the layout to get the swizzle projection, P o L [The Z and Y contributing portions of L] + // (this also tests that shape/stride of layout compose with swizzle) + auto sliced_layout_only_zy = composition(swizzle_only_zy, sliced_layout); + // Transform the end coordinate to get the active bits of the swizzle, (P o L)(c*) + auto swizzle_active_bits = sliced_layout_only_zy(size(sliced_layout_only_zy)-Int<1>{}); + // Determine if any active bits collide under the swizzle + auto hit_ZandY = !(swizzle_active_bits & ~layout.swizzle_fn()(swizzle_active_bits)); + + // The portion of the layout that we are consuming now + auto diced_layout = dice(coord, 
layout.layout_fn()); + auto diced_coord = dice(coord, coord); + + auto diced_layout_anti_zy = composition(swizzle_anti_zy, diced_layout); + auto diced_layout_only_zy = composition(swizzle_only_zy, diced_layout); + + // New swizzle and offset + auto swizzle = layout.swizzle_fn(); + // offset_only_zy interacts with swizzle and gets accumulated with layout.offset_fn() + // being careful about the static/dynamic contributions from diced_layout and diced_coord + auto offset_only_zy = layout.offset_fn() ^ to_mixed_bits(diced_layout_only_zy, diced_coord); + // offset_anti_zy always gets passed through, no interaction with swizzle + auto offset_anti_zy = diced_layout_anti_zy(diced_coord); + + // If Layout's codomain hits on Y AND Z, then it's not reducible + // If Layout's codomain hits on Y XOR Z, then it's dynamic-normal + // If Layout's codomain hits on neither Y NOR Z, then it's static-normal + + // Test the sliced layout's hit_ZandY for potential decay + if constexpr (is_constant::value) + { // Hits on Y AND Z, so it's not reducible + return cute::make_tuple(composition(swizzle, offset_only_zy, sliced_layout), offset_anti_zy); + } else + { // Misses on Y or Z, so it's static-normal or dynamic-normal + + // Lowest bit of the Z and Y masks + auto Z = typename Swizzle::zzz_msk{} & -typename Swizzle::zzz_msk{}; + auto Y = typename Swizzle::yyy_msk{} & -typename Swizzle::yyy_msk{}; + auto stride_lo = detail::make_swizzle_strides(Z < Y, Z, Y, offset_only_zy, make_int_sequence{}); + auto stride_hi = detail::make_swizzle_strides(Z > Y, Z, Y, offset_only_zy, make_int_sequence{}); + + // Construct a (dynamic) layout that we can perform the composition with + auto swizzle_layout = make_layout(make_shape (Int<(1 << M)>{}, repeat(Int<2>{}), Int<(1 << (abs(S)-B))>{}, repeat(Int<2>{}), Int< 1>{}), + make_stride(Int< 1>{}, stride_lo, Int<(1 << (M+B))>{}, stride_hi , Int<(1 << (M+B+abs(S)))>{})); + + // Decay to a normal layout with offset + return cute::make_tuple(composition(swizzle_layout, sliced_layout), + swizzle(to_integral(offset_only_zy)) + offset_anti_zy); + } + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +slice(Coord const& coord, ComposedLayout const& layout) +{ + return get<0>(slice_and_offset(coord, layout)); +} + +// +// composition +// + +template +CUTE_HOST_DEVICE constexpr +auto +composition(Swizzle const& sxor, + Offset const& offset, + Layout const& layout) +{ + return ComposedLayout>{sxor, offset, layout}; +} + +template +CUTE_HOST_DEVICE constexpr +auto +composition(Swizzle const& sxor, + Offset const& offset, + ComposedLayout const& layout) +{ + // Assume disjoint swizzles and offsets for commutativity + return composition(composition(sxor,layout.swizzle_fn()), offset ^ layout.offset_fn(), layout.layout_fn()); +} + +// Ignore identity case +template +CUTE_HOST_DEVICE constexpr +auto +composition(Swizzle<0,M,S> const&, + Int<0> const&, + Layout const& layout) +{ + return layout; +} + +template +CUTE_HOST_DEVICE constexpr +auto +composition(Swizzle const& sxor, + Layout const& layout) +{ + return composition(sxor, Int<0>{}, layout); +} + +template +CUTE_HOST_DEVICE constexpr +auto +composition(ComposedLayout const& a, + LayoutOrTile const& b) +{ + return composition(a.swizzle_fn(), a.offset_fn(), composition(a.layout_fn(), b)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +composition(Layout const& a, + Swizzle const& b) +{ + // Get the Z bits and the Y bits + auto active_Y = a(typename Swizzle::yyy_msk{}); + auto active_Z = a(typename
Swizzle::zzz_msk{}); + + // Works in simple cases... but could be greatly generalized + + return composition(make_swizzle(), a); +} + +template +CUTE_HOST_DEVICE constexpr +auto +composition(Layout const& a, + ComposedLayout const& b) +{ + CUTE_STATIC_ASSERT_V(b.offset_fn() == Int<0>{}, "Require Swizzle offset == 0."); + + return composition(composition(a, b.swizzle_fn()), b.layout_fn()); +} + +template +CUTE_HOST_DEVICE constexpr +auto +composition(ComposedLayout const& a, + ComposedLayout const& b) +{ + auto asb = composition(a.layout_fn(), b); + + return composition(composition(a.swizzle_fn(),asb.swizzle_fn()), asb.offset_fn(), asb.layout_fn()); +} + +// +// complement +// + +template +CUTE_HOST_DEVICE constexpr +auto +complement(ComposedLayout const& layout, CoSizeHi const& cosize_hi) +{ + // Assume there is no swizzle component in the complement + return complement(layout.layout_fn(), cosize_hi); +} + +template +CUTE_HOST_DEVICE constexpr +auto +complement(ComposedLayout const& layout) +{ + return complement(layout, cosize(layout)); +} + +// +// inverse +// + +template +CUTE_HOST_DEVICE constexpr +auto +right_inverse(ComposedLayout const& layout) +{ + CUTE_STATIC_ASSERT_V(layout.offset_fn() == Int<0>{}, "Requires 0-offset."); + return composition(right_inverse(layout.layout_fn()), layout.swizzle_fn()); +} + +template +CUTE_HOST_DEVICE constexpr +auto +left_inverse(ComposedLayout const& layout) +{ + CUTE_STATIC_ASSERT_V(layout.offset_fn() == Int<0>{}, "Requires 0-offset."); + return composition(left_inverse(layout.layout_fn()), layout.swizzle_fn()); +} + +// +// Other operations +// + +template +CUTE_HOST_DEVICE constexpr +auto +max_common_vector(ComposedLayout,Offset,SLayout> const& a, + Layout const& b) +{ + // This assumes that Offset is in the YZ domain of the Swizzle... 
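+  // so contiguity is only guaranteed within the 2^M base elements the swizzle
+  // leaves untouched, e.g. at most Int<8>{} elements when M == 3 (illustrative).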
+ return cute::min(Int<(1 << M)>{}, max_common_vector(a.layout_fn(), b)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +max_common_vector(Layout const& a, + ComposedLayout,Offset,SLayout> const& b) +{ + return max_common_vector(b, a); +} + +template +CUTE_HOST_DEVICE constexpr +auto +max_common_vector(ComposedLayout,Offset0,SLayout0> const& a, + ComposedLayout,Offset1,SLayout1> const& b) +{ + auto result = coalesce(composition(a, right_inverse(b))); + + if constexpr (is_constant<1, decltype(stride<0>(result.layout_fn()))>::value) { + return shape<0>(result); + } else { + return Int<1>{}; + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +zip(ComposedLayout const& a) +{ + return composition(a.swizzle_fn(), a.offset_fn(), zip(a.layout_fn())); +} + +// Partitions + +template +CUTE_HOST_DEVICE constexpr +auto +logical_divide(ComposedLayout const& a, + Tile const& b) +{ + return composition(a.swizzle_fn(), a.offset_fn(), logical_divide(a.layout_fn(), b)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tile_unzip(ComposedLayout const& a, + Tile const& b) +{ + return composition(a.swizzle_fn(), a.offset_fn(), tile_unzip(a.layout_fn(), b)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tiled_divide(ComposedLayout const& a, + Tile const& b) +{ + return composition(a.swizzle_fn(), a.offset_fn(), tiled_divide(a.layout_fn(), b)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +zipped_divide(ComposedLayout const& a, + Tile const& b) +{ + return composition(a.swizzle_fn(), a.offset_fn(), zipped_divide(a.layout_fn(), b)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +logical_product(ComposedLayout const& a, + Tile const& b) +{ + return composition(a.swizzle_fn(), a.offset_fn(), logical_product(a.layout_fn(), b)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tiled_product(ComposedLayout const& a, + Tile const& b) +{ + return composition(a.swizzle_fn(), a.offset_fn(), tiled_product(a.layout_fn(), b)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +blocked_product(ComposedLayout const& a, + Tile const& b) +{ + return composition(a.swizzle_fn(), a.offset_fn(), blocked_product(a.layout_fn(), b)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +raked_product(ComposedLayout const& a, + Tile const& b) +{ + return composition(a.swizzle_fn(), a.offset_fn(), raked_product(a.layout_fn(), b)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tile_to_shape(ComposedLayout const& layout, + Shape const& trg_shape, + ModeOrder const& ord_shape = {}) +{ + return composition(layout.swizzle_fn(), layout.offset_fn(), tile_to_shape(layout.layout_fn(), trg_shape, ord_shape)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +filter(ComposedLayout const& layout, Shape const& trg_profile) +{ + return composition(layout.swizzle_fn(), layout.offset_fn(), filter(layout.layout_fn(), trg_profile)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +coalesce(ComposedLayout const& layout) +{ + return composition(layout.swizzle_fn(), layout.offset_fn(), coalesce(layout.layout_fn())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +coalesce(ComposedLayout const& layout, Shape const& trg_profile) +{ + return composition(layout.swizzle_fn(), layout.offset_fn(), coalesce(layout.layout_fn(), trg_profile)); +} + +/////////////////////////////////////////////////////////////////////////////// +// ComposedLayout as second argument is often more difficult... 
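+// The tile's swizzle cannot simply be forwarded: it must be transferred onto the
+// codomain of the product layout, mirroring detail::transfer_swizzle above.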
+ +template +CUTE_HOST_DEVICE constexpr +auto +logical_product(Layout const& block, + ComposedLayout,Offset,LayoutT> const& tile) +{ + CUTE_STATIC_ASSERT_V(tile.offset_fn() == Int<0>{}, "Require Swizzle offset == 0."); + // The new layout -- if the swizzle weren't an issue, this would be the result; + // our goal is to determine a new swizzle for these strides + auto new_layout = logical_product(block, tile.layout_fn()); + + // This is accomplished by identifying + // S o L :=: S? o L* + // We identify the "active" portion of S by computing (P o L)(c*) where P is a projection generated by S + // Then that active identifier is transformed through the layouts: + // L*(L[(P o L)(c*)]) + // which is a new swizzle identifier for S?, the new swizzle + + // Projections of the swizzle layout for composition, P + auto swizzle_only_zy = make_layout(make_shape (Int<(1 << M)>{}, Int<(1 << B)>{}, Int<(1 << (abs(S)-B))>{}, Int<(1 << B )>{}, Int<1>{}), + make_stride( Int<0>{}, Int<(1 << M)>{}, Int<0>{}, Int<(1 << (M+abs(S)))>{}, Int<0>{})); + + // Compose with the tile to get the swizzle projection, P o L [The Z and Y contributing portions of L] + auto layout_only_zy = composition(swizzle_only_zy, tile.layout_fn()); + // Transform the end coordinate to get the active bits of the swizzle, (P o L)(c*) + auto swizzle_active_bits = layout_only_zy(size(layout_only_zy)-Int<1>{}); + // Get the Z bit and the Y bits + auto active_Z = swizzle_active_bits & typename Swizzle::zzz_msk{}; + auto active_Y = swizzle_active_bits & typename Swizzle::yyy_msk{}; + + // Pass the identifiers through the old layout and new layout to make a new swizzle identifier, L*(L[(P o L)(c*)]) + auto new_active_Z = new_layout(Int<0>{}, tile.layout_fn()[active_Z]); + auto new_active_Y = new_layout(Int<0>{}, tile.layout_fn()[active_Y]); + + // Use this new swizzle identifier to construct the new swizzle for new_layout + // (this also makes sure it's a "valid" swizzle that Swizzle can represent) + return composition(make_swizzle(), new_layout); +} + +template +CUTE_HOST_DEVICE constexpr +auto +tiled_product(Layout const& block, + ComposedLayout const& tile) +{ + // Avoid slicing the swizzle + auto result = logical_product(block, tile); + return composition(result.swizzle_fn(), result.offset_fn(), result.layout_fn()(_, repeat>(_))); +} + +template +CUTE_HOST_DEVICE constexpr +auto +blocked_product(Layout const& block, + ComposedLayout const& layout) +{ + constexpr int R = cute::max(rank_v, rank_v); + auto padded_block = append(block, Layout<_1,_0>{}); + auto padded_layout = append(layout, Layout<_1,_0>{}); + + auto result = logical_product(padded_block, padded_layout); + + return composition(result.swizzle_fn(), + result.offset_fn(), + coalesce(zip(get<0>(result.layout_fn()), get<1>(result.layout_fn())), repeat(Int<1>{}))); +} + +// +// Upcast and Downcast +// + +template +CUTE_HOST_DEVICE constexpr +auto +upcast(ComposedLayout const& layout) +{ + return composition(upcast(layout.swizzle_fn()), upcast(layout.offset_fn()), upcast(layout.layout_fn())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +downcast(ComposedLayout const& layout) +{ + return composition(downcast(layout.swizzle_fn()), downcast(layout.offset_fn()), downcast(layout.layout_fn())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +recast(ComposedLayout const& layout) +{ + if constexpr (sizeof(NewType) == sizeof(OldType)) { + return layout; + } else if constexpr (sizeof(NewType) > sizeof(OldType)) { + static_assert(sizeof(NewType) % sizeof(OldType) == 0, "NewType must be a multiple of
OldType"); + return upcast(layout); + } else if constexpr (sizeof(NewType) < sizeof(OldType)) { + static_assert(sizeof(OldType) % sizeof(NewType) == 0, "NewType must be a divisor of OldType"); + return downcast(layout); + } + + CUTE_GCC_UNREACHABLE; +} + +// +// Display utilities +// + +template +CUTE_HOST_DEVICE void print(ComposedLayout const& layout) +{ + print(layout.swizzle_fn()); print(" o "); print(layout.offset_fn()); print(" o "); print(layout.layout_fn()); +} + +template +CUTE_HOST std::ostream& operator<<(std::ostream& os, ComposedLayout const& layout) +{ + return os << layout.swizzle_fn() << " o " << layout.offset_fn() << " o " << layout.layout_fn(); +} + +} // end namespace cute diff --git a/include/cute/swizzle_ptr.hpp b/include/cute/swizzle_ptr.hpp new file mode 100644 index 0000000000..ed77acba75 --- /dev/null +++ b/include/cute/swizzle_ptr.hpp @@ -0,0 +1,282 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include + +#include +#include +#include + +#include +#include +#include + +/* This implements a swizzle pointer of the form + * InvolutionFn o PtrAdd + * where the InvolutionFn need not be linear. + * + * This differs subtly from swizzle_layout because the smem pointer is used + * as the offset. That means that swizzle_layout will implement position-independent + * swizzle layouts, while swizzle_ptr implements position-dependent swizzle tensors. + * Arch chose to design hardware with position-dependent swizzles. 
+ * + * For clarity: + * NormalLayout : DeRef <- PtrAdd <- [Layout] + * ComposedLayout: DeRef <- PtrAdd <- [Swizzle <- OffsetAdd <- Layout] + * SwizzlePtr : [DeRef <- Swizzle <- PtrAdd] <- Layout + * + * Furthermore, for known swizzles, this pointer attempts to decay itself + * to a normal-pointer with a new layout containing dynamic or static strides. + * This is possible by determining the subdomain of the InvolutionFn + * that is identity and testing if the Layout's codomain is contained + * within it. + */ + +namespace cute +{ + +template +struct smem_ptr_swizzle +{ + static_assert(std::is_empty::value, "Swizzle can't have state."); + + CUTE_HOST_DEVICE constexpr + T* get() const + { + return ptr_; + } + + CUTE_HOST_DEVICE constexpr static + Swizzle get_swizzle() + { + return {}; + } + + CUTE_HOST_DEVICE constexpr static + T* apply_swizzle(T* ptr) + { + return reinterpret_cast(Swizzle::apply(reinterpret_cast(ptr))); + } + + CUTE_HOST_DEVICE constexpr + T& operator*() const + { + return *apply_swizzle(get()); + } + + template + CUTE_HOST_DEVICE constexpr + T& operator[](Int const& i) const + { + return *apply_swizzle(get() + i); + } + + template + CUTE_HOST_DEVICE constexpr + smem_ptr_swizzle operator+(Int const& i) const + { + return {ptr_ + i}; + } + + T* ptr_; +}; + +template +struct is_smem> : true_type {}; + +// Make a swizzle pointer +template +CUTE_HOST_DEVICE constexpr +auto +make_smem_ptr(T* ptr, Swizzle const& swizzle) +{ + return smem_ptr_swizzle{ptr}; +} + +// A model of a nullptr smem_ptr with B == sizeof_bits::value +// That represents an unset pointer. This is a placeholder type that is waiting for an smem_ptr +template +struct smem_ptr_flag_bits : Int<0> {}; + +using smem_ptr_flag = smem_ptr_flag_bits<1>; + +// A flagged construction method to transform ComposedLayout +// Make a swizzle pointer tensor and check that the intended type size matches +template +CUTE_HOST_DEVICE constexpr +auto +make_tensor(smem_ptr const& ptr, + ComposedLayout,Layout> const& layout) +{ + static_assert(B == sizeof_bits::value, "Expected a B-bit pointer type."); + return make_tensor(make_smem_ptr(ptr.get(), layout.swizzle_fn()), + layout.layout_fn()); +} + +// Specialization for immediate decay +template +CUTE_HOST_DEVICE constexpr +auto +make_tensor(smem_ptr_swizzle>& p, Layout const& layout) +{ + return make_tensor(make_smem_ptr(p.ptr_), layout); +} + +template +CUTE_HOST_DEVICE constexpr +auto +make_tensor(smem_ptr_swizzle> const& p, Layout const& layout) +{ + return make_tensor(make_smem_ptr(p.ptr_), layout); +} + +// NOTE: To preserve smem_ptr_flag_bits under recast ops +template +CUTE_HOST_DEVICE constexpr +auto +upcast(ComposedLayout,Layout> const& layout) +{ + return composition(layout.swizzle_fn(), smem_ptr_flag_bits{}, upcast(layout.layout_fn())); +} + +template +CUTE_HOST_DEVICE constexpr +auto +downcast(ComposedLayout,Layout> const& layout) +{ + return composition(layout.swizzle_fn(), smem_ptr_flag_bits{}, downcast(layout.layout_fn())); +} + +// +// Recast +// Swizzle operates on the pointer address, so it doesn't care about the type +// + +template +CUTE_HOST_DEVICE constexpr +auto +recast(smem_ptr_swizzle const& ptr) +{ + return smem_ptr_swizzle{recast(ptr.ptr_)}; +} + +template +CUTE_HOST_DEVICE constexpr +auto +recast(smem_ptr_swizzle const& ptr) +{ + return smem_ptr_swizzle{recast(ptr.ptr_)}; +} + +// +// Conversion with swizzle_layout +// + +template +CUTE_HOST_DEVICE +auto +as_position_independent_swizzle_layout(ComposedLayout,Layout> const& layout) +{ + return 
composition(recast,uint_bit_t>(layout.swizzle_fn()), Int<0>{}, layout.layout_fn()); +} + +template +CUTE_HOST_DEVICE +auto +as_position_independent_swizzle_tensor(Tensor>, Layout> const& tensor) +{ + { + uint32_t address = cast_smem_ptr_to_uint(tensor.data().get()); + uint32_t mask = ((uint32_t(1) << Swizzle::num_base) - 1) & (Swizzle::swizzle_code); + assert((address & mask) == 0); // Alignment to the Base, Z, and Y of Swizzle + } + auto new_swizzle = recast,uint_bit_t>>(tensor.data().get_swizzle()); + return make_tensor(make_smem_ptr(tensor.data().get()), composition(new_swizzle, Int<0>{}, tensor.layout())); +} + +template +CUTE_HOST_DEVICE +auto +as_position_independent_swizzle_tensor(Tensor>, Layout>& tensor) +{ + { + uint32_t address = cast_smem_ptr_to_uint(tensor.data().get()); + uint32_t mask = ((uint32_t(1) << Swizzle::num_base) - 1) & (Swizzle::swizzle_code); + assert((address & mask) == 0); // Alignment to the Base, Z, and Y of Swizzle + } + auto new_swizzle = recast,uint_bit_t>>(tensor.data().get_swizzle()); + return make_tensor(make_smem_ptr(tensor.data().get()), composition(new_swizzle, Int<0>{}, tensor.layout())); +} + +template +CUTE_HOST_DEVICE +auto +as_position_independent_swizzle_tensor(Tensor>, Layout>&& tensor) +{ + return as_position_independent_swizzle_tensor(tensor); +} + +// +// Print +// + +// Capture and cast smem_ptr_flag Layouts to offset-0 layouts +template +CUTE_HOST_DEVICE +void +print_latex(ComposedLayout,Layout> const& layout) +{ + auto new_swizzle = recast,uint_bit_t>(layout.swizzle_fn()); + print_latex(composition(new_swizzle, Int<0>{}, layout.layout_fn())); +} + +template +CUTE_HOST_DEVICE void print(smem_ptr_flag_bits const& ptr) +{ + printf("smem_ptr_%db(unset)", B); +} + +template +CUTE_HOST_DEVICE void print(smem_ptr_swizzle> const& ptr) +{ + printf("smem_ptr_S<%d,%d,%d>_%db(%p)", B, M, S, int(8*sizeof(T)), ptr.get()); +} + +template +CUTE_HOST std::ostream& operator<<(std::ostream& os, smem_ptr_swizzle> const&) +{ + return os << "smem_ptr_S<" << B << "," << M << "," << S << ">_" << int(8*sizeof(T)) << "b"; +} + +} // end namespace cute diff --git a/include/cute/tensor.hpp b/include/cute/tensor.hpp new file mode 100644 index 0000000000..e88c22bcb7 --- /dev/null +++ b/include/cute/tensor.hpp @@ -0,0 +1,900 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cute +{ + +// +// Engine -- owning or non-owning data store +// + +// concept Engine { +// using value_type = ; +// iterator begin(); +// }; + +template +using ArrayEngine = typename std::conditional<(sizeof_bits::value % 8 == 0), + array_aligned, + array_subbyte>::type; + +template +struct ViewEngine +{ + using value_type = typename cute::remove_cvref())>::type; + + using iterator = Iterator; + iterator storage_; + + CUTE_HOST_DEVICE constexpr + iterator const& + begin() const { + return storage_; + } + + CUTE_HOST_DEVICE constexpr + iterator& + begin() { + return storage_; + } +}; + +template +struct is_rmem> : is_rmem {}; +template +struct is_smem> : is_smem {}; +template +struct is_gmem> : is_gmem {}; +template +struct ConstViewEngine +{ + using value_type = typename cute::remove_cvref())>::type; + + using iterator = Iterator; + iterator storage_; + + CUTE_HOST_DEVICE constexpr + iterator const& + begin() const { + return storage_; + } +}; + +template +struct is_rmem> : is_rmem {}; +template +struct is_smem> : is_smem {}; +template +struct is_gmem> : is_gmem {}; +// +// Tensor +// + +template +struct Tensor +{ + using value_type = typename Engine::value_type; + //using pointer = typename engine_traits::pointer; + //using const_pointer = typename engine_traits::const_pointer; + //using reference = typename engine_traits::reference; + //using const_reference = typename engine_traits::const_reference; + + using engine_type = Engine; + using layout_type = Layout; + + CUTE_HOST_DEVICE constexpr + Tensor() {} + + template + CUTE_HOST_DEVICE constexpr + Tensor(Ptr const& ptr, Layout const& layout) + : rep_(layout, ptr) { + } + + // + // Accessors + // + + static constexpr int rank = Layout::rank; + + CUTE_HOST_DEVICE constexpr + decltype(auto) + tensor() const { + return *this; + } + + CUTE_HOST_DEVICE constexpr + decltype(auto) + layout() const { + return get<0>(rep_); + } + + CUTE_HOST_DEVICE constexpr + decltype(auto) + engine() const { + return get<1>(rep_); + } + + CUTE_HOST_DEVICE constexpr + decltype(auto) + engine() { + return get<1>(rep_); + } + + CUTE_HOST_DEVICE constexpr + decltype(auto) + data() const { + return engine().begin(); + } + + CUTE_HOST_DEVICE constexpr + decltype(auto) + data() { + return engine().begin(); + } + + CUTE_HOST_DEVICE constexpr + decltype(auto) + shape() const { + return layout().shape(); + } + + CUTE_HOST_DEVICE constexpr + auto + size() const { + return cute::size(shape()); + } + + CUTE_HOST_DEVICE constexpr + decltype(auto) + stride() const { + return layout().stride(); + } + + // + // Indexing op() and op[] + // + + // Index into this tensor like an array by computing the offset via layout() + template + CUTE_HOST_DEVICE constexpr + decltype(auto) + operator[](Coord const& 
coord) { + return data()[layout()(coord)]; + } + + template + CUTE_HOST_DEVICE constexpr + decltype(auto) + operator[](Coord const& coord) const { + return data()[layout()(coord)]; + } + + template + CUTE_HOST_DEVICE constexpr + decltype(auto) + operator()(Coord const& coord) { + if constexpr (has_underscore::value) { + auto const& [sliced_layout,offset] = slice_and_offset(coord, layout()); + return make_tensor(data() + offset, sliced_layout); + } else { + return data()[layout()(coord)]; + } + + CUTE_GCC_UNREACHABLE; + } + + template + CUTE_HOST_DEVICE constexpr + decltype(auto) + operator()(Coord const& coord) const { + if constexpr (has_underscore::value) { + auto const& [sliced_layout,offset] = slice_and_offset(coord, layout()); + return make_tensor(data() + offset, sliced_layout); + } else { + return data()[layout()(coord)]; + } + + CUTE_GCC_UNREACHABLE; + } + + // op() convenience function for multi-dimensional coordinates + template + CUTE_HOST_DEVICE constexpr + decltype(auto) + operator()(Coord0 const& c0, Coord1 const& c1, Coords const&... cs) { + return operator()(make_coord(c0,c1,cs...)); + } + + template + CUTE_HOST_DEVICE constexpr + decltype(auto) + operator()(Coord0 const& c0, Coord1 const& c1, Coords const&... cs) const { + return operator()(make_coord(c0,c1,cs...)); + } + + // + // Compose + // + + template + CUTE_HOST_DEVICE constexpr + auto + compose(Layouts const&... layouts) { + return make_tensor(data(), layout().compose(layouts...)); + } + + template + CUTE_HOST_DEVICE constexpr + auto + compose(Layouts const&... layouts) const { + return make_tensor(data(), layout().compose(layouts...)); + } + + // + // Tile + // + + template + CUTE_HOST_DEVICE constexpr + auto + tile(Layouts const&... layouts) { + return make_tensor(data(), layout().tile(layouts...)); + } + + template + CUTE_HOST_DEVICE constexpr + auto + tile(Layouts const&... layouts) const { + return make_tensor(data(), layout().tile(layouts...)); + } + + // + // Utility + // + + template ::value)> + CUTE_HOST_DEVICE constexpr + auto + get_1d_coord(Int const& linear_idx) const { + return layout().get_1d_coord(linear_idx); + } + + template ::value)> + CUTE_HOST_DEVICE constexpr + auto + get_hier_coord(Int const& linear_idx) const { + return layout().get_hier_coord(linear_idx); + } + + template ::value)> + CUTE_HOST_DEVICE constexpr + auto + get_flat_coord(Int const& linear_idx) const { + return layout().get_flat_coord(linear_idx); + } + + cute::tuple rep_; +}; + + +template +struct is_tensor : false_type {}; +template +struct is_tensor> : true_type {}; + +template +struct is_rmem> : is_rmem {}; +template +struct is_smem> : is_smem {}; +template +struct is_gmem> : is_gmem {}; +// +// Make an owning Tensor that will allocate a static array +// + +template ::value)> +CUTE_HOST_DEVICE constexpr +auto +make_tensor(Layout const& layout) +{ + static_assert(is_static::value, "Dynamic owning tensors not supported"); + using Engine = ArrayEngine>; + return Tensor(); +} + +// e.g. make_tensor(12) +template ::value)> +CUTE_HOST_DEVICE constexpr +auto +make_tensor(LayoutArg const& arg, LayoutArgs const&... args) +{ + return make_tensor(make_layout(arg, args...)); +} + +// +// Make a non-owning Tensor that will use a pointer (view) +// + +template ::value && + is_layout::value)> +CUTE_HOST_DEVICE constexpr +auto +make_tensor(Iterator const& iter, Layout const& layout) +{ + using Engine = ViewEngine; + return Tensor(iter, layout); +} + +// e.g. 
make_tensor(vec.data(), 12)
+template <class Iterator, class LayoutArg, class... LayoutArgs,
+          __CUTE_REQUIRES(not is_layout<LayoutArg>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+make_tensor(Iterator const& iter, LayoutArg const& arg, LayoutArgs const&... args)
+{
+  return make_tensor(iter, make_layout(arg, args...));
+}
+
+//
+// make_tensor_like -- make a register tensor the same type and shape as another
+//
+
+template <class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+make_tensor_like(Tensor<Engine,Layout> const& tensor)
+{
+  using value_type = typename Tensor<Engine,Layout>::value_type;
+  return make_tensor<value_type>(tensor.shape());
+}
+
+//
+// make_fragment_like -- make a register tensor the same type, shape, and (if possible) order as another tensor
+//
+
+template <class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+make_fragment_like(Tensor<Engine,Layout> const& tensor)
+{
+  using value_type = typename Tensor<Engine,Layout>::value_type;
+  return make_tensor<value_type>(make_layout_like(tensor.layout()));
+}
+
+//
+// make_identity_tensor
+//
+
+template <class Shape>
+CUTE_HOST_DEVICE constexpr
+auto
+make_identity_tensor(Shape const& shape)
+{
+  return make_tensor(ArithmeticTupleIterator(as_arithmetic_tuple(repeat_like(shape, Int<0>{}))),
+                     make_identity_layout(shape));
+}
+
+//
+// Utilities
+//
+
+// Return the subtensor of a mode
+template <class Tensor,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE constexpr
+decltype(auto)
+tensor(Tensor&& tensor)
+{
+  return std::forward<Tensor>(tensor);
+}
+
+template <int... Is, class Tensor,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE constexpr
+decltype(auto)
+tensor(Tensor&& tensor)
+{
+  return make_tensor(std::forward<Tensor>(tensor).data(), get<Is...>(tensor.layout()));
+}
+
+// Return the subtensor of a range of modes
+template <int B, int E, class Tensor,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE constexpr
+decltype(auto)
+take(Tensor&& tensor)
+{
+  return make_tensor(std::forward<Tensor>(tensor).data(), take<B,E>(tensor.layout()));
+}
+
+// Return the layout of a mode
+template <int... Is, class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+decltype(auto)
+layout(Tensor<Engine,Layout> const& tensor)
+{
+  return layout<Is...>(tensor.layout());
+}
+
+// Return the shape of a mode
+template <int... Is, class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+decltype(auto)
+shape(Tensor<Engine,Layout> const& tensor)
+{
+  return shape<Is...>(tensor.layout());
+}
+
+// Return the stride of a mode
+template <int... Is, class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+decltype(auto)
+stride(Tensor<Engine,Layout> const& tensor)
+{
+  return stride<Is...>(tensor.layout());
+}
+
+// Return the number of elements in a mode
+template <int... Is, class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+decltype(auto)
+size(Tensor<Engine,Layout> const& tensor)
+{
+  return size<Is...>(tensor.layout());
+}
+
+// Return the rank of a mode
+template <int... Is, class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+rank(Tensor<Engine,Layout> const& tensor)
+{
+  return rank<Is...>(tensor.layout());
+}
+
+// Return the depth of a mode
+template <int... Is, class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+depth(Tensor<Engine,Layout> const& tensor)
+{
+  return depth<Is...>(tensor.layout());
+}
+
+//
+// Operations to manipulate Tensors like a Layout
+//
+
+template <class Tensor,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+flatten(Tensor&& tensor)
+{
+  return make_tensor(std::forward<Tensor>(tensor).data(), flatten(tensor.layout()));
+}
+
+template <class Tensor,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+coalesce(Tensor&& tensor)
+{
+  return make_tensor(std::forward<Tensor>(tensor).data(), coalesce(tensor.layout()));
+}
+
+template <class Tensor, class Profile,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+coalesce(Tensor&& tensor, Profile const& profile)
+{
+  return make_tensor(std::forward<Tensor>(tensor).data(), coalesce(tensor.layout(), profile));
+}
+
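+// A minimal usage sketch of the factories above (the function name and extents
+// are illustrative): an owning register-backed tensor requires a static layout,
+// while a view wraps an existing pointer and may be dynamically shaped.
+CUTE_HOST_DEVICE void
+example_make_tensor(float* ptr)
+{
+  auto owned = make_tensor<float>(make_shape(Int<4>{}, Int<3>{}));   // owning, static 4x3 fragment
+  auto view  = make_tensor(ptr, make_layout(make_shape(4, 3)));      // non-owning view, dynamic 4x3
+  view(2, 1) = owned(2, 1);     // op() computes the linear offset via layout()
+  auto col1  = view(_, 1);      // an Underscore slices: rank-1 view of column 1
+  (void)col1;
+}
+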
+// Group the modes [B,E) into a single mode
+// e.g. group<2,4>(make_tensor<int>(Layout<Shape<_1,_2,_3,_4,_5,_6>>{}))
+//   => make_tensor<int>(Layout<Shape<_1,_2,Shape<_3,_4>,_5,_6>>{})
+template <int B, int E, class Tensor,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+group_modes(Tensor&& tensor)
+{
+  return make_tensor(std::forward<Tensor>(tensor).data(),
+                     group<B,E>(tensor.layout()));
+}
+
+//
+// Recast
+//
+
+// NOTE: This is very dangerous to do
+//   -- doesn't check dynamic integer divisibility
+//   -- doesn't check alignment
+
+// A tagged version for dispatching
+template <class NewType, class Tensor,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+recast(Tensor&& tensor, type_list<NewType>)
+{
+  using OldType = typename remove_cvref_t<Tensor>::value_type;
+  auto old_layout = tensor.layout();
+  auto new_layout = recast<OldType,NewType>(old_layout);
+
+  // If this is an upcast of a normal Layout with static negative strides, then offset as well
+  if constexpr (sizeof(OldType) < sizeof(NewType) && not is_composed_layout<decltype(old_layout)>::value) {
+    auto shape_diff  = transform(flatten(old_layout.shape()), flatten(new_layout.shape()), minus{});
+    auto extent_diff = transform(shape_diff, flatten(old_layout.stride()), multiplies{});
+    auto offset      = fold(extent_diff, Int<0>{}, [](auto const& i, auto const& a) { return i + cute::min(a,Int<0>{}); });
+
+    return make_tensor(recast<NewType>(std::forward<Tensor>(tensor).data() + offset), new_layout);
+  } else {
+    return make_tensor(recast<NewType>(std::forward<Tensor>(tensor).data()         ), new_layout);
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class NewType, class Tensor,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+recast(Tensor&& tensor)
+{
+  return recast(std::forward<Tensor>(tensor), type_list<NewType>{});
+}
+
+//
+// max_common_vector
+//
+
+/* Return Int<N> such that N is the maximum number of contiguous elements
+ * that logically correspond in the tensors of @a a and @a b. That is,
+ * the number of elements that could reasonably be vectorized into a single load/store.
+ *
+ * @returns Int<N> with N >= 0
+ *
+ * A return value of Int<0> indicates that no such conclusion can be made and no
+ * vectorization should be attempted.
+ */
+template <class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE constexpr
+auto
+max_common_vector(Tensor<SrcEngine,SrcLayout> const& a,
+                  Tensor<DstEngine,DstLayout> const& b)
+{
+  using SrcType = typename Tensor<SrcEngine,SrcLayout>::value_type;
+  using DstType = typename Tensor<DstEngine,DstLayout>::value_type;
+
+  using SrcRef = decltype(*(a.data()));
+  using DstRef = decltype(*(b.data()));
+
+  // Determine whether the tensors are candidates for vectorization at all
+  if constexpr (// Should be the same value_types, else the copy is also performing a cast
+                sizeof(SrcType) == sizeof(DstType) &&
+                // The types should be trivially copyable so that vectorization is valid
+                std::is_trivially_copyable<SrcType>::value &&
+                std::is_trivially_copyable<DstType>::value &&
+                // Should be load/storing real data, rather than implicit iterators or such
+                std::is_reference<SrcRef>::value &&
+                std::is_reference<DstRef>::value)
+  {
+    return max_common_vector(a.layout(), b.layout());
+  } else {
+    return Int<0>{};
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// Key algebraic operations
+//
+
+template <class Tensor, class Tile,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+logical_divide(Tensor && tensor,
+               Tile   const& tile)
+{
+  return make_tensor(std::forward<Tensor>(tensor).data(),
+                     logical_divide(tensor.layout(), tile));
+}
+
+// zipped_divide is logical_divide with modes gathered into standard form ((BLK_A,BLK_B),(a,b))
+template <class Tensor, class Tile,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+zipped_divide(Tensor && tensor,
+              Tile   const& tile)   // Layout or Tile
+{
+  return make_tensor(std::forward<Tensor>(tensor).data(),
+                     zipped_divide(tensor.layout(), tile));
+}
+
+// tiled_divide is logical_divide with the second output mode flattened ((BLK_A,BLK_B),a,b)
+template <class Tensor, class Tile,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+tiled_divide(Tensor && tensor,
+             Tile   const& tile)    // Layout or Tile
+{
+  return make_tensor(std::forward<Tensor>(tensor).data(),
+                     tiled_divide(tensor.layout(), tile));
+}
+
+// logical_product on a Tensor doesn't make sense since it often increases cosize
+
+//
+// Logical Divide utilities: local_partition and local_tile
+//
+
+template <class Tensor, class Tile, class Coord,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+local_partition(Tensor && tensor,
+                Tile  const& tile,
+                Coord const& coord)
+{
+  constexpr int R1 = decltype(rank(tensor))::value;
+
+  // Split the modes of tensor according to the modes of tile
+  // zipped_divide returns something like ((VEC_A,VEC_B,...),(a,b,...))
+
+  // The coord is the coord into the first mode, flatten the rest
+  return zipped_divide(std::forward<Tensor>(tensor), tile)(coord, repeat<R1>(_));
+}
+
+template <class Tensor, class Tile, class Coord, class Projection,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+local_partition(Tensor && tensor,
+                Tile       const& tile,
+                Coord      const& coord,
+                Projection const& proj)
+{
+  return local_partition(std::forward<Tensor>(tensor),
+                         dice(proj, tile),
+                         dice(proj, coord));
+}
+
+// Special case with Layout and Integral that extracts the coord first
+// e.g. local_partition(tensor, ThrLayout, threadIdx.x)
+template <class Tensor, class LShape, class LStride, class Index,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value &&
+                          is_integral<Index>::value)>
+CUTE_HOST_DEVICE
+auto
+local_partition(Tensor && tensor,
+                Layout<LShape,LStride> const& tile,
+                Index  const& index)
+{
+  return local_partition(std::forward<Tensor>(tensor),
+                         product_each(shape(tile)),
+                         tile.get_flat_coord(index));
+}
+
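+// A minimal usage sketch of the Layout/Integral special case above (extents and
+// thread count are illustrative): 256 threads arranged as a 32x8 grid each take
+// a strided (4,16) slice of a 128x128 tile; in device code, tid is typically
+// threadIdx.x.
+CUTE_HOST_DEVICE void
+example_local_partition(float* ptr, int tid)
+{
+  auto gA   = make_tensor(ptr, make_shape(Int<128>{}, Int<128>{}));  // 128x128 data tile
+  auto thr  = make_layout(make_shape(Int<32>{}, Int<8>{}));          // 32x8 thread layout
+  auto tAgA = local_partition(gA, thr, tid);                         // per-thread (4,16) view
+  (void)tAgA;
+}
+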
+// Special case with Layout and Integral that extracts the coord first
+// e.g. local_partition(tensor, ThrLayout, threadIdx.x, Step<_1,X,_1>{})
+template <class Tensor, class LShape, class LStride, class Index, class Projection,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value &&
+                          is_integral<Index>::value)>
+CUTE_HOST_DEVICE
+auto
+local_partition(Tensor && tensor,
+                Layout<LShape,LStride> const& tile,
+                Index      const& index,
+                Projection const& proj)
+{
+  return local_partition(std::forward<Tensor>(tensor),
+                         dice(proj, product_each(shape(tile))),
+                         dice(proj, tile).get_flat_coord(index));
+}
+
+template <class Tensor, class Tile, class Coord,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+local_tile(Tensor && tensor,
+           Tile  const& tile,
+           Coord const& coord)
+{
+  constexpr int R0 = decltype(rank(tile))::value;
+  constexpr int R1 = decltype(rank(tensor))::value;
+
+  // Split the modes of tensor according to the modes of tile
+  // zipped_divide returns something like ((VEC_A,VEC_B,...),(a,b,...))
+
+  // The padded coord is the coord into the second mode; flatten the rest
+  return zipped_divide(std::forward<Tensor>(tensor), tile)(repeat<R0>(_), append<R1>(coord,_));
+}
+
+template <class Tensor, class Tile, class Coord, class Proj,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE
+auto
+local_tile(Tensor && tensor,
+           Tile  const& tile,
+           Coord const& coord,
+           Proj  const& proj)
+{
+  return local_tile(std::forward<Tensor>(tensor),
+                    dice(proj, tile),
+                    dice(proj, coord));
+}
+
+//
+// Display utilities
+//
+
+template <class Engine, class Layout>
+CUTE_HOST_DEVICE void print_tensor(Tensor<Engine,Layout> const& tensor)
+{
+  auto format = get_format(tensor(0));
+  using type = typename decltype(format)::type;
+
+  if constexpr (Layout::rank == 1)
+  {
+    for (int m = 0; m < size(tensor); ++m) {
+      printf(format.format, format.digits, type(tensor(m)));
+      printf("\n");
+    }
+  } else
+  if constexpr (Layout::rank == 2)
+  {
+    for (int m = 0; m < size<0>(tensor); ++m) {
+      for (int n = 0; n < size<1>(tensor); ++n) {
+        printf(format.format, format.digits, type(tensor(m,n)));
+      }
+      printf("\n");
+    }
+  } else
+  if constexpr (Layout::rank == 3)
+  {
+    print_tensor(tensor(_,_,0));
+    for (int k = 1; k < size<2>(tensor); ++k) {
+      for (int i = 0; i < format.digits*size<1>(tensor); ++i) { print("-"); } print("\n");
+      print_tensor(tensor(_,_,k));
+    }
+  } else
+  if constexpr (Layout::rank == 4)
+  {
+    print_tensor(tensor(_,_,_,0));
+    for (int p = 1; p < size<3>(tensor); ++p) {
+      for (int i = 0; i < format.digits*size<1>(tensor); ++i) { print("="); } print("\n");
+      print_tensor(tensor(_,_,_,p));
+    }
+  }
+}
+
+template <class Engine, class Layout>
+CUTE_HOST_DEVICE void print(Tensor<Engine,Layout> const& tensor)
+{
+  print(tensor.layout()); print("\n");
+  print_tensor(tensor);
+}
+
+template <class Engine, class Layout>
+CUTE_HOST std::ostream& print_tensor_os(std::ostream& os, Tensor<Engine,Layout> const& tensor)
+{
+  int digits = 9;
+
+  if constexpr (Layout::rank == 1)
+  {
+    for (int m = 0; m < size(tensor); ++m) {
+      os << std::setw(digits) << tensor(m) << std::endl;
+    }
+  } else
+  if constexpr (Layout::rank == 2)
+  {
+    for (int m = 0; m < size<0>(tensor); ++m) {
+      for (int n = 0; n < size<1>(tensor); ++n) {
+        os << std::setw(digits) << tensor(m,n);
+      }
+      os << std::endl;
+    }
+  } else
+  if constexpr (Layout::rank == 3)
+  {
+    print_tensor_os(os, tensor(_,_,0));
+    for (int k = 1; k < size<2>(tensor); ++k) {
+      for (int i = 0; i < digits*size<1>(tensor); ++i) { os << "-"; } os << std::endl;
+      print_tensor_os(os, tensor(_,_,k));
+    }
+  } else
+  if constexpr (Layout::rank == 4)
+  {
+    print_tensor_os(os, tensor(_,_,_,0));
+    for (int p = 1; p < size<3>(tensor); ++p) {
+      for (int i = 0; i < digits*size<1>(tensor); ++i) { os << "="; } os << std::endl;
+      print_tensor_os(os, tensor(_,_,_,p));
+    }
+  }
+
+  return os;
+}
+
+template <class Engine, class Layout>
+CUTE_HOST std::ostream& operator<<(std::ostream& os, Tensor<Engine,Layout> const& tensor)
+{
+  os << tensor.layout() << std::endl;
+  return print_tensor_os(os, tensor);
+}
+
+} // end namespace cute
+
+//
+// Extended Engines +// + +#include + +// +// Tensor Algorithms +// + +#include +#include +#include +#include +#include +#include diff --git a/include/cute/tensor_predicate.hpp b/include/cute/tensor_predicate.hpp new file mode 100644 index 0000000000..730f219462 --- /dev/null +++ b/include/cute/tensor_predicate.hpp @@ -0,0 +1,63 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include + +namespace cute +{ + +template +struct ConstantTensor +{ + template + CUTE_HOST_DEVICE constexpr + T const& + operator()(Coords const&...) const { + return val_; + } + + T val_; +}; + +struct TrivialPredTensor +{ + template + CUTE_HOST_DEVICE constexpr + true_type + operator()(Coords const&...) const { + return {}; + } +}; + +} // end namespace cute diff --git a/include/cute/tile.hpp b/include/cute/tile.hpp new file mode 100644 index 0000000000..b2fa2e8b7b --- /dev/null +++ b/include/cute/tile.hpp @@ -0,0 +1,58 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include + +namespace cute +{ + +// +// A Tile is not a Layout, it's a tuple of Layouts or Tiles or Underscores +// + +template +using Tile = tuple; + +template +using is_tile = is_tuple; + +template +CUTE_HOST_DEVICE constexpr +auto +make_tile(Layouts const&... layouts) +{ + return Tile(layouts...); +} + +} // end namespace cute diff --git a/include/cute/underscore.hpp b/include/cute/underscore.hpp new file mode 100644 index 0000000000..d79b4ee8c4 --- /dev/null +++ b/include/cute/underscore.hpp @@ -0,0 +1,148 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include
+
+#include
+#include
+#include
+#include
+
+namespace cute
+{
+
+// For slicing
+struct Underscore : Int<0> {};
+
+CUTE_INLINE_CONSTANT Underscore _;
+
+// Treat Underscore as an integral like integral_constant
+template <>
+struct is_integral<Underscore> : true_type {};
+
+template <class T>
+struct is_underscore : false_type {};
+template <>
+struct is_underscore<Underscore> : true_type {};
+
+// Tuple trait for detecting a static member element
+template
+struct has_elem : false_type {};
+template
+struct has_elem : true_type {};
+template
+struct has_elem::value> >
+    : has_elem > {};
+template
+struct has_elem>
+    : disjunction, Elem>...> {};
+
+// Tuple trait for detecting that all member elements are the given static element
+template
+struct all_elem : false_type {};
+template
+struct all_elem : true_type {};
+template
+struct all_elem::value> >
+    : all_elem > {};
+template
+struct all_elem>
+    : conjunction, Elem>...> {};
+
+// Tuple trait for detecting Underscore member
+template <class T>
+using has_underscore = has_elem<T, Underscore>;
+
+template <class T>
+using all_underscore = all_elem<T, Underscore>;
+
+template <class T>
+using has_int1 = has_elem<T, Int<1>>;
+
+template <class T>
+using has_int0 = has_elem<T, Int<0>>;
+
+//
+// Slice keeps only the elements of Tuple B that are paired with an Underscore
+//
+
+template <class A, class B>
+CUTE_HOST_DEVICE constexpr
+auto
+slice(A const& a, B const& b)
+{
+  if constexpr (is_tuple<A>::value) {
+    static_assert(tuple_size<A>::value == tuple_size<B>::value, "Mismatched Ranks");
+    return filter_tuple(a, b, [](auto const& x, auto const& y) { return slice(x,y); });
+  } else if constexpr (is_underscore<A>::value) {
+    return cute::tuple<B>{b};
+  } else {
+    return cute::tuple<>{};
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// Dice keeps only the elements of Tuple B that are paired with an Int
+//
+
+template <class A, class B>
+CUTE_HOST_DEVICE constexpr
+auto
+dice(A const& a, B const& b)
+{
+  if constexpr (is_tuple<A>::value) {
+    static_assert(tuple_size<A>::value == tuple_size<B>::value, "Mismatched Ranks");
+    return filter_tuple(a, b, [](auto const& x, auto const& y) { return dice(x,y); });
+  } else if constexpr (is_underscore<A>::value) {
+    return cute::tuple<>{};
+  } else {
+    return cute::tuple<B>{b};
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// Display utilities
+//
+
+CUTE_HOST_DEVICE void print(Underscore const&) {
+  printf("_");
+}
+
+CUTE_HOST std::ostream& operator<<(std::ostream& os, Underscore const&) {
+  return os << "_";
+}
+
+} // end namespace cute
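+
+// A minimal sketch of slice vs. dice on the same (a,b) pair (values are illustrative):
+//
+//   slice(make_tuple(_, 2, _), make_tuple(4, 5, 6))   // => (4,6) : keeps elements of b paired with an Underscore
+//   dice (make_tuple(_, 2, _), make_tuple(4, 5, 6))   // => (5)   : keeps elements of b paired with an integer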
diff --git a/include/cute/util/debug.hpp b/include/cute/util/debug.hpp
new file mode 100644
index 0000000000..9a62143c95
--- /dev/null
+++ b/include/cute/util/debug.hpp
@@ -0,0 +1,153 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+/**
+ * \file
+ * \brief Debugging and logging functionality
+ */
+
+#include
+
+#include
+
+namespace cute
+{
+
+/******************************************************************************
+ * Debug and logging macros
+ ******************************************************************************/
+
+/**
+ * Formats and prints the given message to stdout
+ */
+#if !defined(CUTE_LOG)
+#  if !defined(__CUDA_ARCH__)
+#    define CUTE_LOG(format, ...) printf(format, __VA_ARGS__)
+#  else
+#    define CUTE_LOG(format, ...)                                \
+        printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \
+               blockIdx.x,  blockIdx.y,  blockIdx.z,             \
+               threadIdx.x, threadIdx.y, threadIdx.z,            \
+               __VA_ARGS__);
+#  endif
+#endif
+
+/**
+ * Formats and prints the given message to stdout only if DEBUG is defined
+ */
+#if !defined(CUTE_LOG_DEBUG)
+#  ifdef DEBUG
+#    define CUTE_LOG_DEBUG(format, ...) CUTE_LOG(format, __VA_ARGS__)
+#  else
+#    define CUTE_LOG_DEBUG(format, ...)
+#  endif
+#endif
+
+/**
+ * \brief Perror macro with exit
+ */
+#if !defined(CUTE_ERROR_EXIT)
+#  define CUTE_ERROR_EXIT(e)                                         \
+      do {                                                           \
+        cudaError_t code = (e);                                      \
+        if (code != cudaSuccess) {                                   \
+          fprintf(stderr, "<%s:%d> %s:\n    %s: %s\n",               \
+                  __FILE__, __LINE__, #e,                            \
+                  cudaGetErrorName(code), cudaGetErrorString(code)); \
+          fflush(stderr);                                            \
+          exit(0);                                                   \
+        }                                                            \
+      } while (0)
+#endif
+
+#if !defined(CUTE_CHECK_LAST)
+#  define CUTE_CHECK_LAST() CUTE_ERROR_EXIT(cudaPeekAtLastError()); CUTE_ERROR_EXIT(cudaDeviceSynchronize())
+#endif
+
+#if !defined(CUTE_CHECK_ERROR)
+#  define CUTE_CHECK_ERROR(e) CUTE_ERROR_EXIT(e)
+#endif
+
+// A dummy function that uses compilation failure to print a type
+template <class T>
+CUTE_HOST_DEVICE
+void
+print_type(T&&) {
+  static_assert(sizeof(T) < 0, "Printing type T.");
+}
+
+//
+// Device-specific helpers
+//
+// e.g.
+// if (thread0()) print(...); +// if (block0()) print(...); +// if (thread(42)) print(...); + +CUTE_HOST_DEVICE +bool +thread(int tid, int bid) +{ +#if defined(__CUDA_ARCH__) + return (threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.x*blockDim.y == tid) + && ( blockIdx.x + blockIdx.y* gridDim.x + blockIdx.z* gridDim.x* gridDim.y == bid); +#else + return true; +#endif +} + +CUTE_HOST_DEVICE +bool +thread(int tid) +{ + return thread(tid, 0); +} + +CUTE_HOST_DEVICE +bool +thread0() +{ + return thread(0,0); +} + +CUTE_HOST_DEVICE +bool +block0() +{ +#if defined(__CUDA_ARCH__) + return !(blockIdx.x | blockIdx.y | blockIdx.z); +#else + return true; +#endif +} + +} // end namespace cute diff --git a/include/cute/util/print.hpp b/include/cute/util/print.hpp new file mode 100644 index 0000000000..ec774b00ff --- /dev/null +++ b/include/cute/util/print.hpp @@ -0,0 +1,140 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include + +// +// CUDA compatible print and printf +// + +namespace cute +{ + +CUTE_HOST_DEVICE +int +num_digits(int x) +{ + return (x < 10 ? 1 : + (x < 100 ? 2 : + (x < 1000 ? 3 : + (x < 10000 ? 4 : + (x < 100000 ? 5 : + (x < 1000000 ? 6 : + (x < 10000000 ? 7 : + (x < 100000000 ? 8 : + (x < 1000000000 ? 
9 :
+          10)))))))));
+}
+
+template <class T>
+struct format_and_size {
+  using type = T;
+  char const* format;
+  int digits;
+};
+
+CUTE_HOST_DEVICE
+format_and_size<int>
+get_format(bool) {
+  return {"%*d", 3};
+}
+
+CUTE_HOST_DEVICE
+format_and_size<int32_t>
+get_format(int32_t) {
+  return {"%*d", 5};
+}
+
+CUTE_HOST_DEVICE
+format_and_size<uint32_t>
+get_format(uint32_t) {
+  return {"%*u", 5};
+}
+
+CUTE_HOST_DEVICE
+format_and_size<int64_t>
+get_format(int64_t) {
+  return {"%*lld", 5};
+}
+
+CUTE_HOST_DEVICE
+format_and_size<uint64_t>
+get_format(uint64_t) {
+  return {"%*llu", 5};
+}
+
+CUTE_HOST_DEVICE
+format_and_size<float>
+get_format(half_t) {
+  return {"%*.2f", 8};
+}
+
+CUTE_HOST_DEVICE
+format_and_size<float>
+get_format(float) {
+  return {"%*.2e", 10};
+}
+
+CUTE_HOST_DEVICE
+format_and_size<double>
+get_format(double) {
+  return {"%*.3e", 11};
+}
+
+//
+// print dispatcher
+//
+
+CUTE_HOST_DEVICE
+void
+print(char const& c) {
+  printf("%c", c);
+}
+
+template <class T,
+          __CUTE_REQUIRES(std::is_integral<T>::value)>
+CUTE_HOST_DEVICE
+void
+print(T const& a) {
+  printf("%d", int(a));
+}
+
+template <class... T>
+CUTE_HOST_DEVICE
+void
+print(char const* format, T const&... t) {
+  printf(format, t...);
+}
+
+} // end namespace cute
diff --git a/include/cute/util/type_traits.hpp b/include/cute/util/type_traits.hpp
new file mode 100644
index 0000000000..4d37eb9e48
--- /dev/null
+++ b/include/cute/util/type_traits.hpp
@@ -0,0 +1,101 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include
+
+#include
+
+#define __CUTE_REQUIRES(...)   typename std::enable_if<(__VA_ARGS__)>::type* = nullptr
+#define __CUTE_REQUIRES_V(...) typename std::enable_if<decltype((__VA_ARGS__))::value>::type* = nullptr
+
+namespace cute
+{
+
+using std::conjunction;
+using std::conjunction_v;
+
+using std::disjunction;
+using std::disjunction_v;
+
+using std::negation;
+using std::negation_v;
+
+using std::void_t;
+
+// C++20
+// using std::remove_cvref;
+template <class T>
+struct remove_cvref {
+  using type = std::remove_cv_t<std::remove_reference_t<T>>;
+};
+
+// C++20
+// using std::remove_cvref_t;
+template <class T>
+using remove_cvref_t = typename remove_cvref<T>::type;
+
+//
+// is_valid
+//
+
+namespace detail {
+
+template <class F, class... Args,
+          class = decltype(std::declval<F&&>()(std::declval<Args&&>()...))>
+CUTE_HOST_DEVICE constexpr auto
+is_valid_impl(int) { return std::true_type{}; }
+
+template <class F, class... Args>
+CUTE_HOST_DEVICE constexpr auto
+is_valid_impl(...) { return std::false_type{}; }
+
+template <class F>
+struct is_valid_fn {
+  template <class... Args>
+  CUTE_HOST_DEVICE constexpr auto
+  operator()(Args&&...) const { return is_valid_impl<F, Args&&...>(int{}); }
+};
+
+} // end namespace detail
+
+template <class F>
+CUTE_HOST_DEVICE constexpr auto
+is_valid(F&&) {
+  return detail::is_valid_fn<F&&>{};
+}
+
+template <class F, class... Args>
+CUTE_HOST_DEVICE constexpr auto
+is_valid(F&&, Args&&...) {
+  return detail::is_valid_impl<F&&, Args&&...>(int{});
+}
+
+} // end namespace cute
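+
+// A minimal sketch of the detection idiom above (names are illustrative): probe
+// for a member without writing a dedicated trait.
+//
+//   auto has_begin = cute::is_valid([](auto&& t) -> decltype(t.begin()) {});
+//   has_begin(some_vector)   // returns std::true_type{}
+//   has_begin(42)            // returns std::false_type{}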
diff --git a/include/cutlass/arch/barrier.h b/include/cutlass/arch/barrier.h
new file mode 100644
index 0000000000..34f0b4ee72
--- /dev/null
+++ b/include/cutlass/arch/barrier.h
@@ -0,0 +1,404 @@
+/***************************************************************************************************
+ * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are not permit-
+ * ted.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Barrier Operations on SM90+
+*/
+
+#pragma once
+
+#include
+#include
+
+namespace cutlass {
+namespace arch {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && (__CUDACC_VER_MAJOR__ >= 12)
+#define CUDA_BARRIER_ENABLED 1
+#else
+#define CUDA_BARRIER_ENABLED 0
+#endif
+
+class NamedBarrier {
+
+  // Data Members:
+
+  // Range = [1 , NUM_THREADS_PER_CTA]
+  // Range % warp-size (i.e. 32) == 0
+  uint32_t const num_threads_;
+
+  // Range : [0, 15]
+  uint32_t const id_;
+
+ public:
+
+  CUTLASS_DEVICE
+  NamedBarrier(uint32_t num_threads, uint32_t id = 0)
+      : num_threads_(num_threads), id_(id) {}
+
+  CUTLASS_DEVICE
+  void arrive_and_wait() const {
+    NamedBarrier::arrive_and_wait(num_threads_, id_);
+  }
+
+  CUTLASS_DEVICE
+  void arrive() const {
+    NamedBarrier::arrive(num_threads_, id_);
+  }
+
+  CUTLASS_DEVICE
+  void sync() const {
+    NamedBarrier::arrive_and_wait();
+  }
+
+  // Static variants
+  CUTLASS_DEVICE
+  static void arrive_and_wait(uint32_t num_threads, uint32_t barrier_id) {
+#if CUDA_BARRIER_ENABLED
+    asm volatile("bar.sync %0, %1;" : : "r"(barrier_id), "r"(num_threads));
+#else
+    asm volatile ("brkpt;\n" ::);
+#endif
+  }
+
+  CUTLASS_DEVICE
+  static void arrive(uint32_t num_threads, uint32_t barrier_id) {
+#if CUDA_BARRIER_ENABLED
+    asm volatile("bar.arrive %0, %1;" : : "r"(barrier_id), "r"(num_threads));
+#else
+    asm volatile ("brkpt;\n" ::);
+#endif
+  }
+
+  CUTLASS_DEVICE
+  static void sync(uint32_t num_threads, uint32_t barrier_id) {
+    NamedBarrier::arrive_and_wait(num_threads, barrier_id);
+  }
+};
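+
+// A minimal usage sketch (the thread count and barrier id are illustrative):
+// all 256 participating threads of a CTA rendezvous on named barrier 1, e.g.
+// to hand a shared-memory tile from producer warps to consumer warps.
+CUTLASS_DEVICE
+void example_named_barrier_handoff() {
+  NamedBarrier bar(/*num_threads=*/256, /*id=*/1);
+  // ... producer warps fill a shared-memory buffer here ...
+  bar.arrive_and_wait();   // synchronizes only the 256 participating threads
+  // ... consumer warps read the buffer here ...
+}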
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Hopper introduces a new cluster-wide barrier which handles cluster-wide arrive-wait (AW) behaviour.
+// This is an extension to the Ampere AW barriers.
+// Note : Ampere AW Barriers have a larger max-arrive count (2^30) than Hopper AW Barriers (2^20).
+
+struct ClusterBarrier {
+
+  using ValueType = uint64_t;
+
+protected:
+  // Can never be initialized - can only be aliased to smem
+  ValueType barrier_;
+
+public:
+
+  CUTLASS_DEVICE
+  ClusterBarrier() = delete;
+
+  CUTLASS_DEVICE
+  void init(uint32_t arrive_count) const {
+    ClusterBarrier::init(&this->barrier_, arrive_count);
+  }
+
+  CUTLASS_DEVICE
+  uint32_t test_wait(uint32_t phase, uint32_t pred=true) const {
+    return ClusterBarrier::test_wait(&this->barrier_, phase, pred);
+  }
+
+  CUTLASS_DEVICE
+  void wait(uint32_t phase) const {
+    ClusterBarrier::wait(&this->barrier_, phase);
+  }
+
+  // Barrier arrive on local smem
+  CUTLASS_DEVICE
+  void arrive() const {
+    ClusterBarrier::arrive(&this->barrier_);
+  }
+
+  // Remote SMEM arrive with a predicate (usually done to pick the thread doing the arrive)
+  CUTLASS_DEVICE
+  void arrive(uint32_t cta_id, uint32_t pred = true ) const {
+    ClusterBarrier::arrive(&this->barrier_, cta_id, pred);
+  }
+
+  //
+  // Static Versions
+  //
+  CUTLASS_DEVICE
+  static void init(ValueType const* smem_ptr, uint32_t arrive_count) {
+#if CUDA_BARRIER_ENABLED
+    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile(
+        "{\n\t"
+        "mbarrier.init.shared.b64 [%1], %0; \n"
+        "}"
+        :
+        : "r"(arrive_count), "r"(smem_addr));
+#else
+    asm volatile ("brkpt;\n" ::);
+#endif
+  }
+
+  // Static version of wait - in case we don't want to burn a register
+  CUTLASS_DEVICE
+  static void wait(ValueType const* smem_ptr, uint32_t phase) {
+#if CUDA_BARRIER_ENABLED
+    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
+    // Arbitrarily large timer value after which try-wait expires and re-tries.
+    uint32_t ticks = 0x989680;
+    asm volatile(
+        "{\n\t"
+        ".reg .pred P1; \n\t"
+        "LAB_WAIT: \n\t"
+        "mbarrier.try_wait.parity.shared.b64 P1, [%0], %1, %2; \n\t"
+        "@P1 bra.uni DONE; \n\t"
+        "bra.uni LAB_WAIT; \n\t"
+        "DONE: \n\t"
+        "}"
+        :
+        : "r"(smem_addr), "r"(phase), "r"(ticks));
+
+#else
+    asm volatile ("brkpt;\n" ::);
+#endif
+  }
+
+  CUTLASS_DEVICE
+  static uint32_t test_wait(ValueType const* smem_ptr, uint32_t phase, uint32_t pred) {
+#if CUDA_BARRIER_ENABLED
+    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
+    uint32_t waitComplete;
+
+    asm volatile(
+        "{\n\t"
+        ".reg .pred P1; \n\t"
+        ".reg .pred P2; \n\t"
+        "setp.eq.u32 P2, %3, 1;\n\t"
+        "@P2 mbarrier.test_wait.parity.shared.b64 P1, [%1], %2; \n\t"
+        "selp.b32 %0, 1, 0, P1; \n\t"
+        "}"
+        : "=r"(waitComplete)
+        : "r"(smem_addr), "r"(phase), "r"(pred));
+
+    return waitComplete;
+#else
+    asm volatile ("brkpt;\n" ::);
+#endif
+    return 0;
+  }
+
+  // Static Predicated version of the above - in case we know the address.
+  CUTLASS_DEVICE
+  static void arrive(ValueType const* smem_ptr, uint32_t cta_id, uint32_t pred) {
+#if CUDA_BARRIER_ENABLED
+    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile(
+        "{\n\t"
+        ".reg .pred p;\n\t"
+        ".reg .b32 remAddr32;\n\t"
+        "setp.eq.u32 p, %2, 1;\n\t"
+        "@p mapa.shared::cluster.u32 remAddr32, %0, %1;\n\t"
+        "@p mbarrier.arrive.shared::cluster.b64 _, [remAddr32];\n\t"
+        "}"
+        :
+        : "r"(smem_addr), "r"(cta_id), "r"(pred));
+#else
+    asm volatile ("brkpt;\n" ::);
+#endif
+  }
+
+  // Barrier arrive on local smem
+  CUTLASS_DEVICE
+  static void arrive(ValueType const* smem_ptr) {
+#if CUDA_BARRIER_ENABLED
+    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
+    uint64_t state = 0;
+    asm volatile(
+        "{\n\t"
+        "mbarrier.arrive.shared.b64 %1, [%0];\n\t"
+        "}"
+        :
+        : "r"(smem_addr), "l"(state));
+#else
+    asm volatile ("brkpt;\n" ::);
+#endif
+  }
+
+  CUTLASS_DEVICE
+  static void invalidate(ValueType const* smem_ptr) {
+#if CUDA_BARRIER_ENABLED
+    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile(
+        "{\n\t"
+        "mbarrier.inval.shared.b64 [%0]; \n\t"
+        "}"
+        :
+        : "r"(smem_addr));
+#else
+    asm volatile ("brkpt;\n" ::);
#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90 also introduces a new type of cluster-wide barrier which supports sync
+// not just based on the arrive count, but also on a transaction count (in bytes)
+struct ClusterTransactionBarrier : public ClusterBarrier {
+
+  CUTLASS_DEVICE
+  ClusterTransactionBarrier() = delete;
+
+  // Performs an arrive operation + bytes reset
+  CUTLASS_DEVICE
+  void arrive_and_reset_bytes(uint32_t transaction_bytes) const {
+    ClusterTransactionBarrier::arrive_and_reset_bytes(&this->barrier_, transaction_bytes);
+  }
+
+  // Performs an arrive operation + bytes reset
+  CUTLASS_DEVICE
+  void arrive_and_reset_bytes(uint32_t transaction_bytes, uint32_t cta_id) const {
+    ClusterTransactionBarrier::arrive_and_reset_bytes(&this->barrier_, transaction_bytes , cta_id, true);
+  }
+
+  CUTLASS_DEVICE
+  void commit(uint32_t transaction_bytes, uint32_t pred = 1) const {
+    uint32_t cta_rank = cute::block_rank_in_cluster();
+    ClusterTransactionBarrier::commit(&this->barrier_, cta_rank, transaction_bytes, pred);
+  }
+
+  CUTLASS_DEVICE
+  void commit(uint32_t dst_cta_id, uint32_t transaction_bytes, uint32_t pred) const {
+    ClusterTransactionBarrier::commit(&this->barrier_, dst_cta_id, transaction_bytes, pred);
+  }
+
+  //
+  // Static Versions
+  //
+
+  // Performs an arrive operation + bytes reset
+  CUTLASS_DEVICE
+  static void arrive_and_reset_bytes(ValueType const* smem_ptr, uint32_t transaction_bytes) {
+#if CUDA_BARRIER_ENABLED
+    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile(
+        "{\n\t"
+        "mbarrier.arrive.expect_tx.shared.b64 _, [%1], %0; \n\t"
+        "}"
+        :
+        : "r"(transaction_bytes), "r"(smem_addr));
+#else
+    asm volatile ("brkpt;\n" ::);
+#endif
+  }
+
+  // Performs an arrive operation + bytes reset for a remote cta_id in a Cluster
+  CUTLASS_DEVICE
+  static void arrive_and_reset_bytes(
+      ValueType const* smem_ptr, uint32_t transaction_bytes, uint32_t cta_id, uint32_t pred) {
+#if CUDA_BARRIER_ENABLED
+    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile(
+        "{\n\t"
+        ".reg .pred p;\n\t"
+        ".reg .b32 remAddr32;\n\t"
+        "setp.eq.u32 p, %2, 1;\n\t"
+        "@p mapa.shared::cluster.u32 remAddr32, %0, %1;\n\t"
+        "@p mbarrier.arrive.expect_tx.shared::cluster.b64 _, [remAddr32], %3;\n\t"
+        "}"
+        :
+        : "r"(smem_addr), "r"(cta_id), "r"(pred), "r"(transaction_bytes));
+#else
+    asm volatile ("brkpt;\n" ::);
+#endif
+  }
+
+  // Performs a bytes reset without doing an arrive operation
+  CUTLASS_DEVICE
+  static void reset_bytes(ValueType const* smem_ptr, uint32_t transaction_bytes) {
+#if CUDA_BARRIER_ENABLED
+    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile(
+        "{\n\t"
+        "mbarrier.expect_tx.shared.b64 [%1], %0; \n\t"
+        "}"
+        :
+        : "r"(transaction_bytes), "r"(smem_addr));
+#else
+    asm volatile ("brkpt;\n" ::);
+#endif
+  }
+
+  // Increments transaction bytes in the barrier
+  CUTLASS_DEVICE
+  static void commit(
+      ValueType const* smem_ptr, uint32_t dst_cta_id, uint32_t transaction_bytes, uint32_t pred = 1) {
+#if CUDA_BARRIER_ENABLED
+    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
+    smem_addr = cute::set_block_rank(smem_addr, dst_cta_id);
+    asm volatile(
+        "{\n\t"
+        ".reg .pred p;\n\t"
+        "setp.eq.u32 p, %2, 1;\n\t"
+        "@p mbarrier.complete_tx.shared::cluster.relaxed.cluster.b64 [%1], %0;"
+        "}"
+        :
+        : "r"(transaction_bytes), "r"(smem_addr), "r"(pred));
+#else
+    asm volatile ("brkpt;\n" ::);
+#endif
+  }
+};
+
+// Helps with visibility of barrier init operations across warps / cta / cluster
+// Available as a separate function so as to batch inits across barriers and fence once
+// Note : It must be composed with an appropriate sync instruction with the right scope
+// to ensure visibility, e.g. __syncthreads() or a cluster_arrive() + cluster_wait()
+CUTLASS_DEVICE
+void fence_barrier_init() {
+#if CUDA_BARRIER_ENABLED
+  asm volatile(
+      "{\n\t"
+      "fence.mbarrier_init.release.cluster; \n"
+      "}"
+      ::);
+#else
+  asm volatile ("brkpt;\n" ::);
+#endif
+}
+
+// Issue a shared memory fence for async operations
+CUTLASS_DEVICE
+void fence_view_async_shared() {
+#if CUDA_BARRIER_ENABLED
+  asm volatile (
+      "{\n\t"
+      "fence.proxy.async.shared::cta; \n"
+      "}"
+      ::);
+#else
+  asm volatile ("brkpt;\n" ::);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+} // end namespace arch
+} // end namespace cutlass
diff --git a/include/cutlass/arch/memory_sm75.h b/include/cutlass/arch/memory_sm75.h
index 5f45eb5858..ba59364f5e 100644
--- a/include/cutlass/arch/memory_sm75.h
+++ b/include/cutlass/arch/memory_sm75.h
@@ -36,6 +36,7 @@
 #include "cutlass/array.h"
 #include "cutlass/layout/matrix.h"
+#include "cute/arch/util.hpp"
 
 namespace cutlass {
 namespace arch {
@@ -65,74 +66,13 @@ inline __device__ void ldsm(Array & D, void const* ptr);
 #define CUDA_LDMATRIX_SUPPORTED 1
 #endif
 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/*
-#if ! defined(CUDA_NVVM_GET_SMEM_POINTER_SUPPORTED) && (__CUDACC_VER_MAJOR__ > 10)
-  #define CUDA_NVVM_GET_SMEM_POINTER_SUPPORTED 1
-#endif
-#if ! defined(CUDA_NVVM_GET_SMEM_POINTER_SUPPORTED)
-  #define CUDA_NVVM_GET_SMEM_POINTER_SUPPORTED ((__CUDACC_VER_MAJOR__ == 10) && (__CUDACC_VER_MINOR__ >= 1))
-#endif
-
-#if ! defined(CUDA_NVVM_GET_SMEM_POINTER_ENABLED)
-  #define CUDA_NVVM_GET_SMEM_POINTER_ENABLED CUDA_NVVM_GET_SMEM_POINTER_SUPPORTED
-#endif
-*/
-
-#if (! defined (__clang__) && __CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)
-  extern "C" {
-  //
-  // This NVVM intrinsic is subject to change in future versions of CUDA.
-  // Clients should not call it directly. Rather, they should use the
-  // cutlass::arch::ldsm<>() template.
- // - __device__ uint32_t __nvvm_get_smem_pointer(void *); - } -#endif - ///////////////////////////////////////////////////////////////////////////////////////////////// /// CUTLASS helper to get SMEM pointer inline __device__ unsigned cutlass_get_smem_pointer(void *ptr) { - -// We prefer to use the new CVTA intrinsics if they are available, otherwise we will fall back to -// the previous internal intrinsics if they are available. -#if (! defined (__clang__) && defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ >= 11) - // - // This NVVM intrinsic converts an address in shared memory to a plain - // unsigned integer. This is necessary to pass to shared memory instructions - // in inline PTX. - // - // In CUDA 11 and beyond, this replaces __nvvm_get_smem_pointer() [only available in 10.2]. - // - //__device__ size_t __cvta_generic_to_shared(void* ptr); - - /// CUTLASS helper to get SMEM pointer - return static_cast(__cvta_generic_to_shared(ptr)); - -#elif (! defined (__clang__) && defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2) - - return __nvvm_get_smem_pointer(ptr); - -#elif defined(__CUDA_ARCH__) - - uint32_t smem_ptr; - - asm( - "{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 %0, smem_ptr; }\n" - : "=r"(smem_ptr) : "l"(ptr)); - - return smem_ptr; - -#else - - CUTLASS_UNUSED(ptr); - CUTLASS_NOT_IMPLEMENTED(); - return 0; - -#endif + return cute::cast_smem_ptr_to_uint(ptr); } - + /// CUTLASS helper to get SMEM pointer inline __device__ unsigned cutlass_get_smem_pointer(void const *ptr) { return cutlass_get_smem_pointer(const_cast(ptr)); diff --git a/include/cutlass/arch/mma.h b/include/cutlass/arch/mma.h index 587ff8864f..7d4d693a09 100644 --- a/include/cutlass/arch/mma.h +++ b/include/cutlass/arch/mma.h @@ -224,5 +224,4 @@ struct SparseMma; #include "cutlass/arch/mma_sm80.h" #include "cutlass/arch/mma_sparse_sm80.h" #include "cutlass/arch/mma_sm90.h" - ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/arch/mma_sm80.h b/include/cutlass/arch/mma_sm80.h index cb7debc8bc..8682ae1ba8 100644 --- a/include/cutlass/arch/mma_sm80.h +++ b/include/cutlass/arch/mma_sm80.h @@ -2166,7 +2166,7 @@ struct Mma< "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); #else - + CUTLASS_UNUSED(a); CUTLASS_UNUSED(b); CUTLASS_UNUSED(c); diff --git a/include/cutlass/arch/mma_sm90.h b/include/cutlass/arch/mma_sm90.h index 85e808a59d..1d0745b408 100644 --- a/include/cutlass/arch/mma_sm90.h +++ b/include/cutlass/arch/mma_sm90.h @@ -47,10 +47,21 @@ //////////////////////////////////////////////////////////////////////////////// #if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 8)) -#define CUTLASS_ARCH_MMA_SM90_SUPPORTED 1 -#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) -#define CUTLASS_ARCH_MMA_SM90_ENABLED + #define CUTLASS_ARCH_MMA_SM90_F64_MMA_SUPPORTED + #if (!defined(CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED)) + #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + #define CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED + #endif + #endif #endif + +#if (__CUDACC_VER_MAJOR__ >= 12) + #define CUTLASS_ARCH_MMA_SM90_SUPPORTED + #if (!defined(CUTLASS_ARCH_MMA_SM90_ENABLED)) + #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + #define CUTLASS_ARCH_MMA_SM90_ENABLED + #endif + #endif #endif //////////////////////////////////////////////////////////////////////////////// @@ -97,7 +108,7 @@ struct Mma< void operator()(FragmentC &d, FragmentA const &a, 
   FragmentB const &b, FragmentC const &c) const {
 
-#if defined(CUTLASS_ARCH_MMA_SM90_ENABLED)
+#if defined(CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED)
 
     double const *A = reinterpret_cast<double const *>(&a);
     double const *B = reinterpret_cast<double const *>(&b);
@@ -105,10 +116,73 @@ struct Mma<
     double const *C = reinterpret_cast<double const *>(&c);
     double *D = reinterpret_cast<double *>(&d);
 
-    asm volatile("mma.sync.aligned.m16n8k4.row.col.f64.f64.f64.f64 {%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
+    asm volatile("mma.sync.aligned.m16n8k4.row.col.f64.f64.f64.f64.rn {%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
         : "=d"(D[0]), "=d"(D[1]), "=d"(D[2]), "=d"(D[3])
-        : "d"(A[0]), "d"(A[1]),
-          "d"(B[0]),
+        : "d"(A[0]), "d"(A[1]),
+          "d"(B[0]),
+          "d"(C[0]), "d"(C[1]), "d"(C[2]), "d"(C[3]));
+
+#else
+    CUTLASS_UNUSED(d);
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_NOT_IMPLEMENTED();
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Matrix Multiply-Add 16x8x8 fp64
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: F64 = F64 * F64 + F64
+template <>
+struct Mma<
+  gemm::GemmShape<16,8,8>,
+  32,
+  double,
+  layout::RowMajor,
+  double,
+  layout::ColumnMajor,
+  double,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<16,8,8>;
+
+  using ElementA = double;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<double, 4>;
+
+  using ElementB = double;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<double, 2>;
+
+  using ElementC = double;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<double, 4>;
+
+  using Operator = OpMultiplyAdd;
+
+  using ArchTag = arch::Sm90;
+
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+    FragmentC const &c) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED)
+
+    double const *A = reinterpret_cast<double const *>(&a);
+    double const *B = reinterpret_cast<double const *>(&b);
+
+    double const *C = reinterpret_cast<double const *>(&c);
+    double *D = reinterpret_cast<double *>(&d);
+
+    asm volatile("mma.sync.aligned.m16n8k8.row.col.f64.f64.f64.f64 {%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n"
+        : "=d"(D[0]), "=d"(D[1]), "=d"(D[2]), "=d"(D[3])
+        : "d"(A[0]), "d"(A[1]), "d"(A[2]), "d"(A[3]),
+          "d"(B[0]), "d"(B[1]),
+          "d"(C[0]), "d"(C[1]), "d"(C[2]), "d"(C[3]));
 
 #else
@@ -118,7 +192,65 @@ struct Mma<
     CUTLASS_UNUSED(b);
     CUTLASS_UNUSED(c);
     CUTLASS_NOT_IMPLEMENTED();
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Matrix Multiply-Add 16x8x16 fp64
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: F64 = F64 * F64 + F64
+template <>
+struct Mma<
+  gemm::GemmShape<16,8,16>,
+  32,
+  double,
+  layout::RowMajor,
+  double,
+  layout::ColumnMajor,
+  double,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<16,8,16>;
+
+  using ElementA = double;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<double, 8>;
+
+  using ElementB = double;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<double, 4>;
+
+  using ElementC = double;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<double, 4>;
+
+  using Operator = OpMultiplyAdd;
+
+  using ArchTag = arch::Sm90;
+
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+    FragmentC const &c) const {
+#if defined(CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED)
+
+    double const *A = reinterpret_cast<double const *>(&a);
+    double const *B = reinterpret_cast<double const *>(&b);
+
+    double const *C = reinterpret_cast<double const *>(&c);
+    double *D = reinterpret_cast<double *>(&d);
+
+    asm volatile("mma.sync.aligned.m16n8k16.row.col.f64.f64.f64.f64 {%0, %1, %2, %3}, {%4, %5, %6, %7, %8, %9, %10, %11}, {%12, %13, %14, %15}, {%16, %17, %18, %19};\n"
+        : "=d"(D[0]), "=d"(D[1]), "=d"(D[2]), "=d"(D[3])
+        : "d"(A[0]), "d"(A[1]), "d"(A[2]), "d"(A[3]), "d"(A[4]), "d"(A[5]), "d"(A[6]), "d"(A[7]),
+          "d"(B[0]), "d"(B[1]), "d"(B[2]), "d"(B[3]),
+          "d"(C[0]), "d"(C[1]), "d"(C[2]), "d"(C[3]));
+
+#else
+    CUTLASS_NOT_IMPLEMENTED();
 #endif
   }
 };
@@ -129,3 +261,4 @@ struct Mma<
 } // namespace cutlass
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/include/cutlass/arch/reg_reconfig.h b/include/cutlass/arch/reg_reconfig.h
new file mode 100644
index 0000000000..2b74a22e6c
--- /dev/null
+++ b/include/cutlass/arch/reg_reconfig.h
@@ -0,0 +1,68 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*!
\file + \brief PTX for CTA Reconfiguration +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDACC_VER_MAJOR__ >= 12)) + #if (defined(__CUDA_ARCH_FEAT_SM90_ALL)) + #define CUDA_CTA_RECONFIG_ACTIVATED 1 + #endif +#else + #define CUDA_CTA_RECONFIG_ACTIVATED 0 +#endif + +namespace cutlass { +namespace arch { + +template +CUTLASS_DEVICE +void warpgroup_reg_alloc(){ +#if CUDA_CTA_RECONFIG_ACTIVATED + asm volatile( "setmaxnreg.inc.sync.aligned.u32 %0;\n" : : "n"(RegCount) ); +#endif +} + +template +CUTLASS_DEVICE +void warpgroup_reg_dealloc(){ +#if CUDA_CTA_RECONFIG_ACTIVATED + asm volatile( "setmaxnreg.dec.sync.aligned.u32 %0;\n" : : "n"(RegCount) ); +#endif +} + +} // namespace arch +} // namespace cutlass diff --git a/include/cutlass/array_subbyte.h b/include/cutlass/array_subbyte.h index d3822da97c..ac30422408 100644 --- a/include/cutlass/array_subbyte.h +++ b/include/cutlass/array_subbyte.h @@ -370,8 +370,6 @@ class Array { CUTLASS_HOST_DEVICE reverse_iterator(Storage *ptr, int idx = 0): ptr_(ptr), idx_(idx) { } - - // TODO }; /// Bidirectional constant iterator over elements @@ -390,8 +388,6 @@ class Array { CUTLASS_HOST_DEVICE const_reverse_iterator(Storage const *ptr, int idx = 0): ptr_(ptr), idx_(idx) { } - - // TODO }; private: diff --git a/include/cutlass/cluster_launch.hpp b/include/cutlass/cluster_launch.hpp new file mode 100644 index 0000000000..4843540752 --- /dev/null +++ b/include/cutlass/cluster_launch.hpp @@ -0,0 +1,156 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! 
\file + \brief Utilities for launching kernels with thread block cluster support on SM90 +*/ + +#pragma once + +#include <cuda_runtime_api.h> +#include <cstdio> +#include "cutlass/cutlass.h" +#include "cutlass/trace.h" + +#if ((__CUDACC_VER_MAJOR__ >= 12) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8))) +# define CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED +#endif + +namespace cutlass { + +#ifndef NDEBUG +#define Return_Status(cudaError_t_status) \ + if (cudaError_t_status != cudaSuccess) { \ + fprintf(stderr, \ + "[ ERROR: CUDA Runtime ] %s:%d: %s\n", \ + __FILE__, \ + __LINE__, \ + cudaGetErrorString(cudaError_t_status)); \ + return Status::kInvalid; \ + } else { \ + return Status::kSuccess; \ + } +#else +#define Return_Status(cudaError_t_status) \ + if (cudaError_t_status != cudaSuccess) { \ + return Status::kInvalid; \ + } else { \ + return Status::kSuccess; \ + } +#endif + +struct ClusterLauncher { + constexpr static int MaxClusterSize = 32; + + // Check for hardware compatibility + static inline __host__ + Status check_cluster_dims(dim3 const& grid, dim3 const& cluster) { + if (((cluster.x * cluster.y * cluster.z) <= MaxClusterSize) && + (grid.x % cluster.x == 0) && (grid.y % cluster.y == 0) && (grid.z % cluster.z == 0)) { + return Status::kSuccess; + } + else { + CUTLASS_TRACE_HOST("ClusterLauncher: Invalid cluster configuration -- aborting launch."); + return Status::kInvalid; + } + } + + static inline __host__ + Status +#if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED) + init(void const* kernel_function) +#else + init(void const* /* kernel_function */) +#endif + { +#if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED) + // This attribute was added in CUDA 11.8. + cudaError_t status = + cudaFuncSetAttribute( + kernel_function, cudaFuncAttributeNonPortableClusterSizeAllowed, 1); + Return_Status(status); +#else + return Status::kInvalid; +#endif + } + + // This is the method we expect to use going forward + static inline __host__ + Status launch( + dim3 const& grid_dims, + dim3 const& cluster_dims, + dim3 const& block_dims, + size_t const& smem_size, + cudaStream_t& cuda_stream, + void const* kernel, + void** kernel_params) { +#if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED) + if (check_cluster_dims(grid_dims, cluster_dims) != Status::kSuccess) { + CUTLASS_TRACE_HOST("ClusterLauncher: check_cluster_dims() failed. Aborting."); + return Status::kInvalid; + } + + auto init_status = init(kernel); + if (init_status != Status::kSuccess) { + CUTLASS_TRACE_HOST("ClusterLauncher: init(kernel) failed with status " << int(init_status) << ". 
Aborting."); + return Status::kInvalid; + } + + cudaLaunchConfig_t launch_config; + launch_config.gridDim = {grid_dims.x, grid_dims.y, grid_dims.z}; + launch_config.blockDim = {block_dims.x, block_dims.y, block_dims.z}; + launch_config.dynamicSmemBytes = smem_size; + launch_config.stream = cuda_stream; + + cudaLaunchAttribute launch_attribute[1]; + launch_attribute[0].id = cudaLaunchAttributeClusterDimension; + launch_attribute[0].val.clusterDim.x = cluster_dims.x; + launch_attribute[0].val.clusterDim.y = cluster_dims.y; + launch_attribute[0].val.clusterDim.z = cluster_dims.z; + + launch_config.attrs = launch_attribute; + launch_config.numAttrs = 1; + + CUTLASS_TRACE_HOST("ClusterLauncher: Launching GPC_CLUSTER_GRID GridDims = " + "(" << grid_dims.x << ", " << grid_dims.y << ", " << grid_dims.z << "), " + "And ClusterDims = " + "(" << cluster_dims.x << ", " << cluster_dims.y << ", " << cluster_dims.z << ")\n"); + + cudaError_t status = cudaLaunchKernelExC(&launch_config, kernel, kernel_params); + Return_Status(status); +#else + CUTLASS_TRACE_HOST("ClusterLauncher: CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED not defined! Aborting cluster launch."); + return Status::kInvalid; +#endif + } +}; + +} // namespace cutlass diff --git a/include/cutlass/conv/kernel/implicit_gemm_convolution.h b/include/cutlass/conv/kernel/implicit_gemm_convolution.h index 2d2d249466..11ac967c65 100644 --- a/include/cutlass/conv/kernel/implicit_gemm_convolution.h +++ b/include/cutlass/conv/kernel/implicit_gemm_convolution.h @@ -332,7 +332,7 @@ struct ImplicitGemmConvolution { // Broadcast the warp_id computed by lane 0 to ensure dependent code // is compiled as warp-uniform. - int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int warp_idx = canonical_warp_idx(); int lane_idx = threadIdx.x % 32; // diff --git a/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h b/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h index d65a34e1e7..b740c9058f 100644 --- a/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h +++ b/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h @@ -339,7 +339,7 @@ struct ImplicitGemmConvolutionFusion { // Broadcast the warp_id computed by lane 0 to ensure dependent code // is compiled as warp-uniform. - int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int warp_idx = canonical_warp_idx(); int lane_idx = threadIdx.x % 32; // diff --git a/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h b/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h index 75d0338b86..7304cbdecb 100644 --- a/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h +++ b/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h @@ -335,7 +335,7 @@ struct ImplicitGemmConvolutionStridedDgrad { // Broadcast the warp_id computed by lane 0 to ensure dependent code // is compiled as warp-uniform. 
- int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int warp_idx = canonical_warp_idx(); int lane_idx = threadIdx.x % 32; // Check if CTA contributes valid MMA (Dy * w) and accumulator will be non-zero after MMA diff --git a/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h b/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h index 8c6013c5d5..3fa7daca1b 100644 --- a/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h +++ b/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h @@ -341,7 +341,7 @@ struct ImplicitGemmConvolutionWithFusedEpilogue { // Broadcast the warp_id computed by lane 0 to ensure dependent code // is compiled as warp-uniform. - int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int warp_idx = canonical_warp_idx(); int lane_idx = threadIdx.x % 32; // diff --git a/include/cutlass/cutlass.h b/include/cutlass/cutlass.h index 1884443470..12bc3a3717 100644 --- a/include/cutlass/cutlass.h +++ b/include/cutlass/cutlass.h @@ -72,20 +72,20 @@ CUTLASS_HOST_DEVICE void __CUTLASS_UNUSED(T const &) #include -#if defined(_MSC_VER) - #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __FUNCSIG__) -#else - #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __PRETTY_FUNCTION__) -#endif - -#else - -#if defined(_MSC_VER) - #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __FUNCSIG__) -#else - #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __PRETTY_FUNCTION__) -#endif + #if defined(__CUDA_ARCH__) + #if defined(_MSC_VER) + #define CUTLASS_NOT_IMPLEMENTED() { printf("%s not implemented\n", __FUNCSIG__); asm volatile ("brkpt;\n"); } + #else + #define CUTLASS_NOT_IMPLEMENTED() { printf("%s not implemented\n", __PRETTY_FUNCTION__); asm volatile ("brkpt;\n"); } + #endif + #else + #if defined(_MSC_VER) + #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __FUNCSIG__) + #else + #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __PRETTY_FUNCTION__) + #endif + #endif #endif //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -181,10 +181,11 @@ static char const* cutlassGetStatusString(cutlass::Status status) { //////////////////////////////////////////////////////////////////////////////////////////////////// -static const int NUM_THREADS_PER_WARP = 32; -static const int NUM_THREADS_PER_HALF_WARP = NUM_THREADS_PER_WARP / 2; -static const int NUM_THREADS_PER_QUAD = 4; -static const int NUM_THREADS_PER_QUAD_PAIR = NUM_THREADS_PER_QUAD * 2; +static const int NumThreadsPerWarp = 32; +static const int NumThreadsPerWarpGroup = 128; +static const int NumThreadsPerHalfWarp = NumThreadsPerWarp / 2; +static const int NumThreadsPerQuad = 4; +static const int NumThreadsPerQuadPair = NumThreadsPerQuad * 2; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -197,6 +198,28 @@ CUTLASS_HOST_DEVICE bool thread0() { #endif } +/// Returns a warp-uniform value indicating the canonical warp index of the calling threads. +/// Threads within the warp must be converged. +CUTLASS_DEVICE +int canonical_warp_idx() { + #if defined(__CUDA_ARCH__) + return __shfl_sync(0xffffffff, threadIdx.x / NumThreadsPerWarp, 0); + #else + return 0; + #endif +} + +/// Returns a warp-uniform value indicating the canonical warp group index of the calling threads. +/// Threads within the warp must be converged. 
+CUTLASS_DEVICE +int canonical_warp_group_idx() { + #if defined(__CUDA_ARCH__) + return __shfl_sync(0xffffffff, threadIdx.x / NumThreadsPerWarpGroup, 0); + #else + return 0; + #endif +} + //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace cutlass diff --git a/include/cutlass/device_kernel.h b/include/cutlass/device_kernel.h index d2903ac352..68042e3fb0 100644 --- a/include/cutlass/device_kernel.h +++ b/include/cutlass/device_kernel.h @@ -34,7 +34,24 @@ #pragma once -#include "cutlass/cutlass.h" +// __grid_constant__ was introduced in CUDA 11.7. +#if ((__CUDACC_VER_MAJOR__ >= 12) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 7))) +# define CUTLASS_GRID_CONSTANT_SUPPORTED +#endif + +// __grid_constant__ can be enabled only on SM70+ +#if defined(CUTLASS_GRID_CONSTANT_SUPPORTED) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700) +# define CUTLASS_GRID_CONSTANT_ENABLED +#endif + +#if ! defined(CUTLASS_GRID_CONSTANT) +# if defined(CUTLASS_GRID_CONSTANT_ENABLED) +# define CUTLASS_GRID_CONSTANT __grid_constant__ +# else +# define CUTLASS_GRID_CONSTANT +# endif +#endif + //////////////////////////////////////////////////////////////////////////////// namespace cutlass { @@ -75,5 +92,22 @@ void Kernel2(typename Operator::Params params) { //////////////////////////////////////////////////////////////////////////////// -} /// namespace cutlass +// +// 3.0 specific launch +// +//////////////////////////////////////////////////////////////////////////////// + +/// Generic CUTLASS kernel template. +template <typename Operator> +__global__ __launch_bounds__(Operator::MaxThreadsPerBlock, Operator::MinBlocksPerMultiprocessor) +void device_kernel(CUTLASS_GRID_CONSTANT typename Operator::Params const params) +{ + // Dynamic shared memory base pointer + extern __shared__ char smem[]; + + Operator op; + op(params, smem); +} +//////////////////////////////////////////////////////////////////////////////// +} /// namespace cutlass diff --git a/include/cutlass/epilogue/collective/collective_epilogue.hpp b/include/cutlass/epilogue/collective/collective_epilogue.hpp new file mode 100644 index 0000000000..5b1b924549 --- /dev/null +++ b/include/cutlass/epilogue/collective/collective_epilogue.hpp @@ -0,0 +1,49 @@ +/*************************************************************************************************** + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + class DispatchPolicy, + class... Args +> +struct CollectiveEpilogue { + static_assert(std::is_void_v, "Could not find an epilogue specialization."); +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "default_epilogue.hpp" +#include "epilogue.hpp" +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/collective/default_epilogue.hpp b/include/cutlass/epilogue/collective/default_epilogue.hpp new file mode 100644 index 0000000000..71499b5d38 --- /dev/null +++ b/include/cutlass/epilogue/collective/default_epilogue.hpp @@ -0,0 +1,195 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/*! \file + \brief Functor performing elementwise operations used by epilogues. +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" +#include "cute/numeric/int.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Applies an element wise operation to all elements within the fragment +/// and writes them out to destination storage. +template < + class StrideC_, + class StrideD_, + class ThreadEpilogueOp_ +> +class DefaultEpilogue { +public: + // + // Type Aliases + // + // derived types of output thread level operator + using ThreadEpilogueOp = ThreadEpilogueOp_; + using ElementOutput = typename ThreadEpilogueOp::ElementOutput; + using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator; + using ElementCompute = typename ThreadEpilogueOp::ElementCompute; + using ElementScalar = ElementCompute; + using ElementC = typename ThreadEpilogueOp::ElementC; + using StrideC = StrideC_; + using ElementD = typename ThreadEpilogueOp::ElementD; + using StrideD = StrideD_; + + static const int kOutputAlignment = ThreadEpilogueOp::kCount; + using AlignmentType = typename cute::uint_bit<sizeof_bits<ElementOutput>::value * kOutputAlignment>::type; + + static_assert(rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + static_assert(rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + + struct SharedStorage { }; + + // Params of epilogue::collective contain the epilogue::thread params + struct Params { + ElementC const* ptr_C = nullptr; + StrideC dC{}; + ElementD* ptr_D = nullptr; + StrideD dD{}; + typename ThreadEpilogueOp::Params thread_params{}; + }; + + // + // Methods + // + + template <class Args> + static constexpr Params + to_underlying_arguments(Args const& args, void* workspace) { + (void) workspace; + return {args.epilogue_params}; + } + + CUTLASS_HOST_DEVICE + DefaultEpilogue(Params const& params_) : params(params_) { } + + template< + class ProblemShapeMNKL, + class BlockShapeMNK, + class BlockCoordMNKL, + class FrgEngine, class FrgLayout, + class TiledMma, + class ResidueMNK + > + CUTLASS_HOST_DEVICE void + operator()( + ProblemShapeMNKL problem_shape_mnkl, + BlockShapeMNK blk_shape_MNK, + BlockCoordMNKL blk_coord_mnkl, + cute::Tensor<FrgEngine, FrgLayout> const& accumulators, + TiledMma tiled_mma, + ResidueMNK residue_mnk, + int thread_idx, + char* smem_buf) + { + using namespace cute; + using X = Underscore; + + static_assert(rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(is_static<BlockShapeMNK>::value, "ThreadBlock tile shape must be static"); + static_assert(rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3"); + static_assert(rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 4"); + + (void) smem_buf; + ThreadEpilogueOp epilogue_op{params.thread_params}; + + // Separate out problem shape for convenience + auto M = get<0>(problem_shape_mnkl); + auto N = get<1>(problem_shape_mnkl); + auto L = get<3>(problem_shape_mnkl); + + // Represent the full output tensor + Tensor mC_mnl = make_tensor(make_gmem_ptr(params.ptr_C), make_shape(M,N,L), params.dC); // (m,n,l) + Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D), make_shape(M,N,L), params.dD); // (m,n,l) + Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK,
make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + + // Slice to get the tile this CTA is responsible for + auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl; + Tensor gC = gC_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + Tensor gD = gD_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + + // Partition source and destination tiles to match the accumulator partitioning + auto thr_mma = tiled_mma.get_thread_slice(thread_idx); + Tensor tCgD = thr_mma.partition_C(gD); // (VEC,THR_M,THR_N) + Tensor tCgC = thr_mma.partition_C(gC); // (VEC,THR_M,THR_N) + + static_assert(is_static::value, "Accumulator layout must be static"); + CUTE_STATIC_ASSERT_V(size(tCgC) == size(tCgD), + "Source and destination must have the same number of elements."); + CUTE_STATIC_ASSERT_V(size(tCgD) == size(accumulators), + "Accumulator count must have the same destination element count."); + + // Make an identity coordinate tensor for predicating our output MN tile + auto cD = make_identity_tensor(make_shape(unwrap(shape<0>(gD)), unwrap(shape<1>(gD)))); + Tensor tCcD = thr_mma.partition_C(cD); + + // source is needed + if (epilogue_op.is_source_needed()) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(accumulators); ++i) { + if (elem_less(tCcD(i), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) { + tCgD(i) = epilogue_op(accumulators(i), tCgC(i)); + } + } + } + // source is not needed, avoid load + else { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(accumulators); ++i) { + if (elem_less(tCcD(i), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) { + tCgD(i) = epilogue_op(accumulators(i)); + } + } + } + } + +private: + Params params; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace collective +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/collective/default_transposed_epilogue.hpp b/include/cutlass/epilogue/collective/default_transposed_epilogue.hpp new file mode 100644 index 0000000000..7e38acd75b --- /dev/null +++ b/include/cutlass/epilogue/collective/default_transposed_epilogue.hpp @@ -0,0 +1,203 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing elementwise operations used by epilogues. +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" +#include "cute/numeric/int.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +using namespace cute; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Applies an element wise operation to all elements within the fragment +/// and writes them out to destination storage. +template < + class StrideC_, + class StrideD_, + class ThreadEpilogueOp_ +> +class DefaultTransposedEpilogue { + +public: + // + // Type Aliases + // + // derived types of output thread level operator + using ThreadEpilogueOp = ThreadEpilogueOp_; + using ElementOutput = typename ThreadEpilogueOp::ElementOutput; + using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator; + using ElementCompute = typename ThreadEpilogueOp::ElementCompute; + using ElementScalar = ElementCompute; + using ElementC = typename ThreadEpilogueOp::ElementC; + using StrideC = StrideC_; + using ElementD = typename ThreadEpilogueOp::ElementD; + using StrideD = StrideD_; + + static const int kOutputAlignment = ThreadEpilogueOp::kCount; + using AlignmentType = typename cute::uint_bit::value * kOutputAlignment>::type; + + static_assert(rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + static_assert(rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + + struct SharedStorage { }; + + // Params of epilogue::collective contain the epilogue::thread params + struct Params { + ElementC const* ptr_C = nullptr; + StrideC dC{}; + ElementD* ptr_D = nullptr; + StrideD dD{}; + typename ThreadEpilogueOp::Params thread_params{}; + }; + + // + // Methods + // + + template + static constexpr Params + to_underlying_arguments(Args const& args, void* workspace) { + (void) workspace; + return {args.epilogue_params}; + } + + CUTLASS_HOST_DEVICE + DefaultTransposedEpilogue(Params const& params_) : params(params_) { } + + template< + class ProblemShapeMNKL, + class BlockShapeMNK, + class BlockCoordMNKL, + class FrgEngine, class FrgLayout, + class TiledMma, + class ResidueMNK + > + CUTLASS_HOST_DEVICE void + operator()( + ProblemShapeMNKL problem_shape_mnkl, + BlockShapeMNK blk_shape_MNK, + BlockCoordMNKL blk_coord_mnkl, + cute::Tensor const& accumulators, + TiledMma tiled_mma, + ResidueMNK residue_mnk, + int 
thread_idx, + char* smem_buf) + { + using namespace cute; + using X = Underscore; + + static_assert(rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(is_static<BlockShapeMNK>::value, "ThreadBlock tile shape must be static"); + static_assert(rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3"); + static_assert(rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 4"); + + (void) smem_buf; + ThreadEpilogueOp epilogue_op{params.thread_params}; + + // Separate out problem shape for convenience + auto M = get<0>(problem_shape_mnkl); + auto N = get<1>(problem_shape_mnkl); + auto L = get<3>(problem_shape_mnkl); + + // Transpose stride C/D. + auto stride_c = make_stride(get<1>(params.dC), get<0>(params.dC), get<2>(params.dC)); + auto stride_d = make_stride(get<1>(params.dD), get<0>(params.dD), get<2>(params.dD)); + + // Represent the full output tensor + Tensor mC_mnl = make_tensor(make_gmem_ptr(params.ptr_C), make_shape(M,N,L), stride_c); // (m,n,l) + Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D), make_shape(M,N,L), stride_d); // (m,n,l) + Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + + // Slice to get the tile this CTA is responsible for + auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl; + Tensor gC = gC_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + Tensor gD = gD_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + + // Partition source and destination tiles to match the accumulator partitioning + auto thr_mma = tiled_mma.get_thread_slice(thread_idx); + Tensor tCgD = thr_mma.partition_C(gD); // (VEC,THR_M,THR_N) + Tensor tCgC = thr_mma.partition_C(gC); // (VEC,THR_M,THR_N) + + static_assert(is_static<FrgLayout>::value, "Accumulator layout must be static"); + CUTE_STATIC_ASSERT_V(size(tCgC) == size(tCgD), + "Source and destination must have the same number of elements."); + CUTE_STATIC_ASSERT_V(size(tCgD) == size(accumulators), + "Accumulator count must have the same destination element count."); + + auto cD = make_identity_tensor(make_shape(unwrap(shape<0>(gD)), unwrap(shape<1>(gD)))); + Tensor tCcD = thr_mma.partition_C(cD); + + // source is needed + if (epilogue_op.is_source_needed()) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(accumulators); ++i) { + if (elem_less(tCcD(i), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) { + tCgD(i) = epilogue_op(accumulators(i), tCgC(i)); + } + } + } + // source is not needed, avoid load + else { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(accumulators); ++i) { + if (elem_less(tCcD(i), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) { + tCgD(i) = epilogue_op(accumulators(i)); + } + } + } + } + +private: + Params params; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace collective +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/collective/epilogue.hpp b/include/cutlass/epilogue/collective/epilogue.hpp new file mode 100644 index 0000000000..565e752ea0 --- /dev/null +++ b/include/cutlass/epilogue/collective/epilogue.hpp @@ -0,0 +1,322 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing elementwise operations used by epilogues. +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Applies an element wise operation to all elements within the fragment +/// and writes it out to destination storage. 
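[Editor's note] Before the doc comment continues, a distilled sketch of the register-to-smem-to-register-to-gmem staging that the Epilogue class below implements, reduced to plain arrays so the data flow is visible. Everything here is illustrative (the real code uses CuTe tensors, tiled copies, and named-barrier synchronization), and a 256-thread block computing a 16x16 tile is assumed:

    __global__ void staged_epilogue_demo(float const* acc, float* D, float alpha) {
      // Assume thread t holds the accumulator for element (row = t % 16,
      // col = t / 16): a column-major register layout whose direct row-major
      // global store would be uncoalesced.
      __shared__ float smem[16][16];
      int t = threadIdx.x;

      smem[t % 16][t / 16] = acc[blockIdx.x * 256 + t]; // Step 1: accumulators to smem
      __syncthreads();                                  // Step 2: wait for smem writes

      // Step 3: re-read so consecutive threads now hold consecutive row
      // elements (a stand-in for the smem-to-register tiled copy).
      float r = smem[t / 16][t % 16];
      __syncthreads();                                  // Step 4: wait for smem reads

      // Steps 5-6: elementwise op, then a coalesced row-major store.
      D[blockIdx.x * 256 + t] = alpha * r;
    }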
+/// +/// Ways to generalize this: +/// - CTA tile shape +/// - vectorization requirements (GMEM) +/// - vectoriz(able) transform() +/// +template < + class StrideC_, + class StrideD_, + class ThreadEpilogueOp_, + class SmemLayout_, + class CopyAtomR2S_, + class TiledCopyS2R_, + class CopyAtomR2G_ +> +class Epilogue { +public: + // + // Type Aliases + // + // derived types of output thread level operator + using ThreadEpilogueOp = ThreadEpilogueOp_; + using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator; + using ElementCompute = typename ThreadEpilogueOp::ElementCompute; + using ElementScalar = ElementCompute; + using ElementOutput = typename ThreadEpilogueOp::ElementOutput; + using ElementC = typename ThreadEpilogueOp::ElementC; + using StrideC = StrideC_; + using ElementD = typename ThreadEpilogueOp::ElementD; + using StrideD = StrideD_; + + using SmemLayout = SmemLayout_; + using CopyAtomR2S = CopyAtomR2S_; + using TiledCopyS2R = TiledCopyS2R_; + using CopyAtomR2G = CopyAtomR2G_; + + static const int kOutputAlignment = ThreadEpilogueOp::kCount; + using AlignmentType = typename cute::uint_bit::value * kOutputAlignment>::type; + + static_assert(rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + static_assert(rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + + struct SharedStorage + { + cute::array_aligned> smem_epilogue; + }; + + // Params of epilogue::collective contain the epilogue::thread params + struct Params { + ElementC const* ptr_C = nullptr; + StrideC dC{}; + ElementD* ptr_D = nullptr; + StrideD dD{}; + typename ThreadEpilogueOp::Params thread_params{}; + }; + + // + // Methods + // + + template + static constexpr Params + to_underlying_arguments(Args const& args, void* workspace) { + (void) workspace; + return {args.epilogue_params}; + } + + CUTLASS_HOST_DEVICE + Epilogue(Params const& params_) : params(params_) { }; + + template< + class ProblemShapeMNKL, + class BlockShapeMNK, + class BlockCoordMNKL, + class FrgEngine, class FrgLayout, + class TiledMma, + class ResidueMNK + > + CUTLASS_DEVICE void + operator()( + ProblemShapeMNKL problem_shape_mnkl, + BlockShapeMNK blk_shape_MNK, + BlockCoordMNKL blk_coord_mnkl, + cute::Tensor const& accumulators, // (MMA,MMA_M,MMA_N) + TiledMma tiled_mma, + ResidueMNK residue_mnk, + int thread_idx, + char* smem_buf) + { + using namespace cute; + using X = Underscore; + + static_assert(rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(is_static::value, "ThreadBlock tile shape must be static"); + static_assert(rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3"); + static_assert(rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 3"); + + // synchronizing function for smem reads/writes +#if CUDA_BARRIER_ENABLED + auto synchronize = [] () { NamedBarrier::sync(typename TiledCopyS2R::TiledNumThr{}, 0); }; +#else + auto synchronize = [] () { __syncthreads(); }; +#endif + + ThreadEpilogueOp epilogue_op{this->params.thread_params}; + + // Separate out problem shape for convenience + auto M = get<0>(problem_shape_mnkl); + auto N = get<1>(problem_shape_mnkl); + auto L = get<3>(problem_shape_mnkl); + + // Represent the full output tensor + Tensor mC_mnl = make_tensor(make_gmem_ptr(params.ptr_C), make_shape(M,N,L), params.dC); // (m,n,l) + Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D), make_shape(M,N,L), params.dD); // (m,n,l) + Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + Tensor gD_mnl = 
local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + + // Slice to get the tile this CTA is responsible for + auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl; + Tensor gC = gC_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + Tensor gD = gD_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + + // Construct a tensor in SMEM that we can partition for rearranging data + SharedStorage& storage = *reinterpret_cast(smem_buf); + Tensor sC = make_tensor(make_smem_ptr(storage.smem_epilogue.data()), SmemLayout{}); // (SMEM_M,SMEM_N) + + // Partition sC to match the accumulator partitioning + auto tC = make_tiled_copy_C(CopyAtomR2S{}, tiled_mma).get_thread_slice(thread_idx); + Tensor tCaC = tC.retile_S(accumulators); // ((Atom,AtomNum), MMA_M, MMA_N) + Tensor tCsC = tC.partition_D(sC); // ((Atom,AtomNum),PIPE_M,PIPE_N) + + // Tile gD and gC by the shape of SmemLayout first + auto tile = make_shape(size<0>(sC), size<1>(sC)); + Tensor gCt = local_tile(gC, tile, _); // (SMEM_M,SMEM_N,TILE_M,TILE_N) + Tensor gDt = local_tile(gD, tile, _); // (SMEM_M,SMEM_N,TILE_M,TILE_N) + + // Partition sC, gC, and gD for the output + auto tD = TiledCopyS2R{}.get_thread_slice(thread_idx); + Tensor tDsC = tD.partition_S(sC); // ((Atom,AtomNum),ATOM_M,ATOM_N) + Tensor tDgC = tD.partition_D(gCt); // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N) + Tensor tDgD = tD.partition_D(gDt); // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N) + + // Allocate intermediate registers on the dst tensors + Tensor tDrC = make_tensor(take<0,3>(shape(tDgC))); // ((Atom,AtomNum),ATOM_M,ATOM_N) + Tensor tDrD = make_tensor(shape(tDrC)); // ((Atom,AtomNum),ATOM_M,ATOM_N) + + // Repeat the D-partitioning for coordinates and predication + Tensor cD = make_identity_tensor(make_shape(size<0>(gD),size<1>(gD))); // (BLK_M,BLK_N) -> (blk_m,blk_n) + Tensor cDt = local_tile(cD, tile, _); // (SMEM_M,SMEM_N,TILE_M,TILE_N) + Tensor tDcD = tD.partition_D(cDt); // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N) + + CUTE_STATIC_ASSERT(size<1>(tCaC) % size<3>(tDgC) == 0); // TILE_M divides MMA_M + CUTE_STATIC_ASSERT(size<2>(tCaC) % size<4>(tDgC) == 0); // TILE_N divides MMA_N + CUTE_STATIC_ASSERT(typename TiledCopyS2R::TiledNumThr{} == size<0>(typename TiledMma::AtomLayoutC_TV{})); + +#if 0 + if (thread_idx == 0 && m_coord == 0 && n_coord == 0) { + print("aC : "); print(accumulators.layout()); print("\n"); + print("gC : "); print(gC.layout()); print("\n"); + print("gD : "); print(gD.layout()); print("\n"); + print("sC : "); print(sC.layout()); print("\n"); + print("\n"); + print("tCsC : "); print(tCsC.layout()); print("\n"); + print("tCaC : "); print(tCaC.layout()); print("\n"); + print("\n"); + print("gDt : "); print(gDt.layout()); print("\n"); + print("tDsC : "); print(tDsC.layout()); print("\n"); + print("tDrC : "); print(tDrC.layout()); print("\n"); + print("\n"); + print("tDrD : "); print(tDrD.layout()); print("\n"); + print("tDgC : "); print(tDgC.layout()); print("\n"); + print("tDgD : "); print(tDgD.layout()); print("\n"); + print("\n"); + } +#endif + + // For each tiling needed for SmemLayout to cover shape(gD) + CUTLASS_PRAGMA_UNROLL + for (int step_m = 0; step_m < size<2>(cDt); ++step_m) + { + CUTLASS_PRAGMA_UNROLL + for (int step_n = 0; step_n < size<3>(cDt); ++step_n) + { + // Step 1. 
Copy to SMEM + CUTLASS_PRAGMA_UNROLL + for (int pipe_m = 0; pipe_m < size<1>(tCsC); ++pipe_m) { + CUTLASS_PRAGMA_UNROLL + for (int pipe_n = 0; pipe_n < size<2>(tCsC); ++pipe_n) { + int mma_m = step_m * size<1>(tCsC) + pipe_m; + int mma_n = step_n * size<2>(tCsC) + pipe_n; + + copy(tC, tCaC(_,mma_m,mma_n), tCsC(_,pipe_m,pipe_n)); + } + } + + // Step 2. Wait for SMEM writes to complete + synchronize(); + + // Step 3. Copy from SMEM into a fragment + copy(tD, tDsC, tDrC); + + // Step 4. Wait for SMEM reads to complete + synchronize(); + + Tensor tDgDmn = tDgD(_,_,_,step_m,step_n); + Tensor tDcDmn = tDcD(_,_,_,step_m,step_n); + + if (epilogue_op.is_source_needed()) { + // source is needed + Tensor tDgCmn = tDgC(_,_,_,step_m,step_n); + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < size<1>(tDgDmn); ++m) + { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < size<2>(tDgDmn); ++n) + { + // Predication + if (get<0>(tDcDmn(0,m,n)) < get<0>(residue_mnk) && + get<1>(tDcDmn(0,m,n)) < get<1>(residue_mnk)) + { + // Step 5. Elementwise operation with conversion + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<0>(tDrC); ++i) { + tDrD(i,m,n) = epilogue_op(tDrC(i,m,n), tDgCmn(i,m,n)); + } + // Step 6. Copy to GMEM + copy(CopyAtomR2G{}, tDrD(_,m,n), tDgDmn(_,m,n)); + } + } + } + } + else { + // source is not needed, avoid load and lift compute + + // Step 5. Elementwise operation with conversion + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tDrC); ++i) { + tDrD(i) = epilogue_op(tDrC(i)); + } + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < size<1>(tDgDmn); ++m) + { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < size<2>(tDgDmn); ++n) + { + // Predication + if (get<0>(tDcDmn(0,m,n)) < get<0>(residue_mnk) && + get<1>(tDcDmn(0,m,n)) < get<1>(residue_mnk)) + { + // Step 6. Copy to GMEM + copy(CopyAtomR2G{}, tDrD(_,m,n), tDgDmn(_,m,n)); + } + } + } + } + } + } + } + +private: + Params params; +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace collective +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/dispatch_policy.hpp b/include/cutlass/epilogue/dispatch_policy.hpp new file mode 100644 index 0000000000..de318d538f --- /dev/null +++ b/include/cutlass/epilogue/dispatch_policy.hpp @@ -0,0 +1,39 @@ +/*************************************************************************************************** + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue { + +////////////////////////////////////////////////////////////////////////////// + +// +// Collective Epilogue Policies +// + +////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue diff --git a/include/cutlass/epilogue/thread/linear_combination.h b/include/cutlass/epilogue/thread/linear_combination.h index b22c26d0ba..0c4b3849df 100644 --- a/include/cutlass/epilogue/thread/linear_combination.h +++ b/include/cutlass/epilogue/thread/linear_combination.h @@ -62,7 +62,8 @@ template < typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination ScaleType::Kind Scale = ScaleType::Default, ///< Control Alpha and Beta scaling - FloatRoundStyle Round = FloatRoundStyle::round_to_nearest + FloatRoundStyle Round = FloatRoundStyle::round_to_nearest, + typename ElementSource_ = ElementOutput_ > class LinearCombination { public: @@ -70,6 +71,8 @@ class LinearCombination { using ElementOutput = ElementOutput_; using ElementAccumulator = ElementAccumulator_; using ElementCompute = ElementCompute_; + using ElementC = ElementSource_; + using ElementD = ElementOutput_; static int const kCount = Count; static const ScaleType::Kind kScale = Scale; @@ -78,7 +81,6 @@ class LinearCombination { using ComputeFragment = Array; using ParamsBase = LinearCombinationParams; - static FloatRoundStyle const kRound = Round; /// Host-constructable parameters structure @@ -89,28 +91,28 @@ class LinearCombination { ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory CUTLASS_HOST_DEVICE - Params(): + Params(): ParamsBase( - ElementCompute(1), + ElementCompute(1), ElementCompute(0) ), - alpha(ElementCompute(1)), - beta(ElementCompute(0)), - alpha_ptr(nullptr), + alpha(ElementCompute(1)), + beta(ElementCompute(0)), + alpha_ptr(nullptr), beta_ptr(nullptr) { } CUTLASS_HOST_DEVICE Params( ElementCompute alpha, ElementCompute beta - ): + ): ParamsBase(alpha, beta), - alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) { } + alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) { } CUTLASS_HOST_DEVICE Params( ElementCompute alpha - ): + ): ParamsBase(alpha, ElementCompute(0)), alpha(alpha), beta(0), alpha_ptr(nullptr), beta_ptr(nullptr) { } @@ -118,7 +120,7 @@ class LinearCombination { Params( ElementCompute const *alpha_ptr, ElementCompute const *beta_ptr - ): + ): ParamsBase(*alpha_ptr, *beta_ptr), alpha(0), 
beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { } @@ -132,13 +134,13 @@ class LinearCombination { CUTLASS_HOST_DEVICE Params( ParamsBase const& base - ): ParamsBase(base), alpha_ptr(nullptr), beta_ptr(nullptr) { #if defined(__CUDA_ARCH__) alpha = reinterpret_cast<ElementCompute const&>(base.alpha_data); beta = reinterpret_cast<ElementCompute const&>(base.beta_data); #else - memcpy( alpha, base.alpha_data, sizeof(ElementCompute) ); - memcpy( beta, base.alpha_data, sizeof(ElementCompute) ); + memcpy( &alpha, base.alpha_data, sizeof(ElementCompute) ); + memcpy( &beta, base.beta_data, sizeof(ElementCompute) ); #endif } }; @@ -184,7 +186,7 @@ class LinearCombination { /// Computes linear scaling: D = alpha * accumulator + beta * source CUTLASS_HOST_DEVICE FragmentOutput operator()( - FragmentAccumulator const &accumulator, + FragmentAccumulator const &accumulator, FragmentOutput const &source) const { // Convert source to internal compute numeric type @@ -236,8 +238,61 @@ class LinearCombination { ComputeFragment intermediate; multiplies<ComputeFragment> mul_accumulator; - intermediate = mul_accumulator(alpha_, converted_accumulator); // D = alpha * Accum + intermediate = mul_accumulator(alpha_, converted_accumulator); // D = alpha * Accum + + return destination_converter(intermediate); + } + + // + // Specializations for scalar (for use with cute::collective::DefaultEpilogue) + // + CUTLASS_HOST_DEVICE + ElementD operator()(ElementAccumulator const accumulator, ElementC const source) const { + // Convert everything to Compute type, do compute, and then store to output type + NumericConverter<ElementCompute, ElementAccumulator, Round> accumulator_converter; + [[maybe_unused]] NumericConverter<ElementCompute, ElementC, Round> source_converter; + NumericConverter<ElementD, ElementCompute, Round> destination_converter; + + // Convert to destination numeric type + + ElementCompute converted_accumulator = accumulator_converter(accumulator); + if constexpr (Scale == ScaleType::Nothing) { + return destination_converter(converted_accumulator); + } + + // Perform binary operations + ElementCompute intermediate; + multiplies<ElementCompute> multiply; + multiply_add<ElementCompute> madd; + + if constexpr (Scale == ScaleType::NoBetaScaling) { + intermediate = source_converter(source); + } + else { + intermediate = multiply(beta_, source); // X = beta * C + uniform + } + + intermediate = madd(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X + return destination_converter(intermediate); + } + + CUTLASS_HOST_DEVICE + ElementD operator()(ElementAccumulator const accumulator) const { + // Convert everything to Compute type, do compute, and then store to output type + NumericConverter<ElementCompute, ElementAccumulator, Round> accumulator_converter; + NumericConverter<ElementD, ElementCompute, Round> destination_converter; + ElementCompute converted_accumulator = accumulator_converter(accumulator); + + // Convert to destination numeric type + if constexpr (Scale == ScaleType::Nothing) { + return destination_converter(converted_accumulator); + } + + // Perform binary operations + ElementCompute intermediate; + multiplies<ElementCompute> multiply; + intermediate = multiply(alpha_, converted_accumulator); // D = alpha * Accum return destination_converter(intermediate); } }; diff --git a/include/cutlass/functional.h b/include/cutlass/functional.h index 972cf04bf1..277bad5c0f 100644 --- a/include/cutlass/functional.h +++ b/include/cutlass/functional.h @@ -56,6 +56,12 @@ struct absolute_value_op { } }; +template <> +struct absolute_value_op<float> { + CUTLASS_HOST_DEVICE + float operator()(float lhs) const { return fabs(lhs); } +}; + template <typename T> struct plus { CUTLASS_HOST_DEVICE @@ -83,6 +89,30 @@ struct multiplies { } }; +// Maximum with NaN propagation +// To propagate NaNs, the "max" of two elements where either is a NaN should also return a NaN +template <typename T> +struct maximum_with_nan_propogation { + CUTLASS_HOST_DEVICE + T operator()(T const &lhs, T const &rhs) const { + return lhs > rhs or std::isnan(lhs) ? lhs : rhs; + } +}; + +template <> +struct maximum_with_nan_propogation<float> { + CUTLASS_HOST_DEVICE + float operator()(float const lhs, float const rhs) const { + float res; +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + asm volatile("max.NaN.f32 %0, %1, %2;\n" : "=f"(res) : "f"(lhs), "f"(rhs)); +#else + res = lhs > rhs or std::isnan(lhs) ? lhs : rhs; +#endif + return res; + } +}; + /// Squares with optional conversion template <typename T, typename Output = T> struct square { diff --git a/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl b/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl new file mode 100644 index 0000000000..c1444a9840 --- /dev/null +++ b/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl @@ -0,0 +1,414 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include "cutlass/arch/mma.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cute/atom/mma_traits_sm90_gmma.hpp" +#include "cute/atom/copy_traits_sm90_tma.hpp" + +// SM90 Collective Builders should be used only starting CUDA 12.0 +#if (__CUDACC_VER_MAJOR__ >= 12) +#define CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +// +// Some named constants +// +constexpr int tma_alignment_bytes = 16; +constexpr int cp_async_min_alignment_bytes = 4; +constexpr int sm90_smem_capacity_bytes = 232448; + +// Maps 2.x A matrix layout tag to respective GMMA major mode enum +template +constexpr cute::GMMA::Major +tag_to_gmma_major_A() { + // MN major mode is only valid for non-TF32 and non-int MMAs + if constexpr (std::is_same_v && + not std::is_same_v && + not std::is_same_v && + not std::is_same_v) { + return cute::GMMA::Major::MN; + } + else { + return cute::GMMA::Major::K; + } +} + +// Maps 2.x B matrix layout tag to respective GMMA major mode enum +template +constexpr cute::GMMA::Major +tag_to_gmma_major_B() { + // MN major mode is only valid for non-TF32 and non-int MMAs + if constexpr (std::is_same_v && + not std::is_same_v && + not std::is_same_v && + not std::is_same_v) { + return cute::GMMA::Major::MN; + } + else { + return cute::GMMA::Major::K; + } +} + +// Maps a rank-1 cute::Shape<> representing the cluster shape on to the TMA atom that should be used with it +template +constexpr auto +cluster_shape_to_tma_atom(UnimodalClusterShape unimodal_cluster_shape) { + static_assert(cute::rank(unimodal_cluster_shape) == 1, + "Use this function to figure out TMA for each mode individually."); + + if constexpr (cute::size(unimodal_cluster_shape) == 1) { + return cute::SM90_TMA_LOAD{}; + } + else { + return cute::SM90_TMA_LOAD_MULTICAST{}; + } +} + +// Generates the most efficient possible TiledCopy with cp.async copy atom given a set of parameters. 
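[Editor's note] A worked example of the thread-layout arithmetic the function below performs for a K-major operand, using assumed parameters ThreadCount = 128, Element = half_t (2 bytes), Alignment = 8, and a CTA tile with TileSizeK = 64 (a hedged editorial sketch, not part of the patch):

    constexpr int Alignment     = 8;    // elements per access: 8 x 2 B = 16 B per cp.async
    constexpr int ThreadCount   = 128;
    constexpr int TileSizeK     = 64;
    constexpr int threads_major = TileSizeK / Alignment;        // 8 threads span K
    constexpr int threads_minor = ThreadCount / threads_major;  // 16 threads span MN
    static_assert(threads_major * Alignment == TileSizeK);      // K mode tiled exactly
    static_assert(threads_major * threads_minor == ThreadCount);

That is, the resulting TiledCopy arranges a 16 x 8 grid of threads, each issuing one 16-byte cp.async along the contiguous K mode, which keeps global reads coalesced.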
+template +constexpr auto +make_cp_async_gmem_tiled_copy() { + using AlignmentType = cute::uint_byte_t(sizeof(Element)) * Alignment>; + constexpr int TileSizeMN = cute::size(TileMN{}); + constexpr int TileSizeK = cute::size(TileK{}); + + // Maximize the number of threads along the gmem major mode to promote coalesced reads + // While making sure our thread layout tiles the threadblock tile evenly + if constexpr (cute::size<1>(StrideType{}) == 1) { + // K major thread layout for K major gmem + constexpr int threads_major = TileSizeK / Alignment; + constexpr int threads_minor = ThreadCount / threads_major; + static_assert(threads_major > 0); + static_assert(ThreadCount % threads_major == 0); + static_assert(threads_minor == 0 || (TileSizeMN % threads_minor == 0)); + return make_tiled_copy( + Copy_Atom, Element>{}, + Layout,Int>, + Stride, _1>>{}, + Layout>>{}); + } + else if constexpr (cute::size<0>(StrideType{}) == 1) { + // MN major thread layout for MN major gmem + constexpr int threads_major = TileSizeMN / Alignment; + constexpr int threads_minor = ThreadCount / threads_major; + static_assert(threads_major > 0); + static_assert(ThreadCount % threads_major == 0); + static_assert(threads_minor == 0 || (TileSizeK % threads_minor == 0)); + return make_tiled_copy( + Copy_Atom, Element>{}, + Layout,Int>, + Stride< _1,Int>>{}, + Layout,_1>>{}); + } + else { + static_assert(std::is_void_v, "Unsupported gmem layout for automatic gmem tiled copy builder."); + } +} + +// Returns the maximum number of smem tiles that can be used with a given smem capacity, or overrides with manual count. +template +constexpr int +compute_stage_count_or_override(int KernelSmemCarveout = 0) { + if constexpr (std::is_same_v) { + // 32 bytes to account for barriers etc. + constexpr int stage_barrier_bytes = 32; + constexpr int a_bytes = static_cast(sizeof(ElementA)); + constexpr int b_bytes = static_cast(sizeof(ElementB)); + constexpr int stage_bytes = + (a_bytes * size<0>(TileShapeMNK{}) * size<2>(TileShapeMNK{})) + + (b_bytes * size<1>(TileShapeMNK{}) * size<2>(TileShapeMNK{})) + + stage_barrier_bytes; + + return (CapacityBytes - KernelSmemCarveout) / stage_bytes; + } + else { + return StageCountType::value; + } +} + +// Kernel policy selection logic: auto dispatches to KernelTmaWarpSpecialized for now. Subject to change. 
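[Editor's note] The dispatch logic below sizes the mainloop pipeline with compute_stage_count_or_override(). A worked example (hedged, editorial) of that arithmetic for the automatic stage count, assuming half-precision operands and a 128x128x64 CTA tile:

    constexpr int capacity    = 232448;                 // sm90_smem_capacity_bytes
    constexpr int a_bytes     = 2 * 128 * 64;           // A tile stage: 16384 B
    constexpr int b_bytes     = 2 * 128 * 64;           // B tile stage: 16384 B
    constexpr int stage_bytes = a_bytes + b_bytes + 32; // + barrier overhead = 32800 B
    constexpr int stages      = capacity / stage_bytes; // 232448 / 32800 = 7 stages
    static_assert(stages == 7);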
+template < + class ElementA, + class ElementB, + class ElementAccumulator, + class TileShape_MNK, + class ClusterShape_MNK, + class StageCountType, + class KernelScheduleType +> +constexpr auto +generate_gmma_dispatch_policy() { + if constexpr (std::is_base_of_v or + std::is_same_v) { + constexpr int PipelineStages = compute_stage_count_or_override< + sm90_smem_capacity_bytes, ElementA, ElementB, TileShape_MNK, StageCountType>(); + + if constexpr (std::is_same_v or + std::is_same_v) { + return MainloopSm90TmaGmmaWarpSpecialized{}; + } + else { + static_assert(sizeof(ElementA) == 0, "Invalid kernel schedule type."); + } + } + + else if constexpr (std::is_base_of_v) { + // For the persistent kernel, assume that the epilogue uses 1 MN tile worth of smem + constexpr int EpilogueTileCarveout = sizeof(ElementAccumulator) * + (size<0>(TileShape_MNK{}) * size<1>(TileShape_MNK{})); + constexpr int PipelineStages = compute_stage_count_or_override< + sm90_smem_capacity_bytes, ElementA, ElementB, TileShape_MNK, StageCountType>(EpilogueTileCarveout); + + if constexpr (std::is_same_v) { + return MainloopSm90TmaGmmaWarpSpecialized{}; + } + else { + static_assert(sizeof(ElementA) == 0, "Invalid kernel schedule type."); + } + } + + else if constexpr (std::is_base_of_v) { + constexpr int PipelineStages = compute_stage_count_or_override< + sm90_smem_capacity_bytes, ElementA, ElementB, TileShape_MNK, StageCountType>(); + + return MainloopSm90TmaGmma{}; + } + + else { + static_assert(sizeof(ElementA) == 0, "Invalid kernel schedule type."); + } +} + +} // namespace detail + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA_TMA_SS +template < + class ElementA, + class GmemLayoutA, + int AlignmentA, + class ElementB, + class GmemLayoutB, + int AlignmentB, + class ElementAccumulator, + class TileShape_MNK, + class ClusterShape_MNK, + class StageCountType, + class KernelScheduleType +> +struct CollectiveBuilder< + arch::Sm90, + arch::OpClassTensorOp, + ElementA, + GmemLayoutA, + AlignmentA, + ElementB, + GmemLayoutB, + AlignmentB, + ElementAccumulator, + TileShape_MNK, + ClusterShape_MNK, + StageCountType, + KernelScheduleType, + std::enable_if_t< + // TMA requires alignment be 16 bytes + ((sizeof(ElementA) * AlignmentA) % detail::tma_alignment_bytes == 0) && + ((sizeof(ElementB) * AlignmentB) % detail::tma_alignment_bytes == 0) && + not std::is_same_v && + // dispatch TN tf32 and int8 kernels only to TMA builder + ((sizeof(ElementA) == 2 && sizeof(ElementB) == 2) || + (std::is_same_v && std::is_same_v))> +> { + static_assert(is_static::value); + static_assert(is_static::value); + + #ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED + static_assert(sizeof(ElementA) == 0, "Unsupported Toolkit for SM90 Collective Builder\n"); + #endif + + // For fp32 types, map to tf32 MMA value type + using MmaElementA = std::conditional_t, tfloat32_t, ElementA>; + using MmaElementB = std::conditional_t, tfloat32_t, ElementB>; + + static constexpr cute::GMMA::Major GmmaMajorA = detail::tag_to_gmma_major_A(); + static constexpr cute::GMMA::Major GmmaMajorB = detail::tag_to_gmma_major_B(); + + using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector< + MmaElementA, MmaElementB, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>())); + + using GmemTiledCopyA = decltype(detail::cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{}))); + using GmemTiledCopyB = decltype(detail::cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{}))); + + using 
+
+// GMMA_TMA_SS
+template <
+  class ElementA,
+  class GmemLayoutA,
+  int AlignmentA,
+  class ElementB,
+  class GmemLayoutB,
+  int AlignmentB,
+  class ElementAccumulator,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class StageCountType,
+  class KernelScheduleType
+>
+struct CollectiveBuilder<
+    arch::Sm90,
+    arch::OpClassTensorOp,
+    ElementA,
+    GmemLayoutA,
+    AlignmentA,
+    ElementB,
+    GmemLayoutB,
+    AlignmentB,
+    ElementAccumulator,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    StageCountType,
+    KernelScheduleType,
+    std::enable_if_t<
+      // TMA requires alignment be 16 bytes
+      ((sizeof(ElementA) * AlignmentA) % detail::tma_alignment_bytes == 0) &&
+      ((sizeof(ElementB) * AlignmentB) % detail::tma_alignment_bytes == 0) &&
+      not std::is_same_v<KernelMultistage, KernelScheduleType> &&
+      // dispatch TN tf32 and int8 kernels only to TMA builder
+      ((sizeof(ElementA) == 2 && sizeof(ElementB) == 2) ||
+       (std::is_same_v<cutlass::layout::RowMajor, GmemLayoutA> &&
+        std::is_same_v<cutlass::layout::ColumnMajor, GmemLayoutB>))>
+> {
+  static_assert(is_static<TileShape_MNK>::value);
+  static_assert(is_static<ClusterShape_MNK>::value);
+
+  #ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+  static_assert(sizeof(ElementA) == 0, "Unsupported Toolkit for SM90 Collective Builder\n");
+  #endif
+
+  // For fp32 types, map to tf32 MMA value type
+  using MmaElementA = std::conditional_t<std::is_same_v<float, ElementA>, tfloat32_t, ElementA>;
+  using MmaElementB = std::conditional_t<std::is_same_v<float, ElementB>, tfloat32_t, ElementB>;
+
+  static constexpr cute::GMMA::Major GmmaMajorA = detail::tag_to_gmma_major_A<GmemLayoutA>();
+  static constexpr cute::GMMA::Major GmmaMajorB = detail::tag_to_gmma_major_B<GmemLayoutB>();
+
+  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector<
+      MmaElementA, MmaElementB, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>()));
+
+  using GmemTiledCopyA = decltype(detail::cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
+  using GmemTiledCopyB = decltype(detail::cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
+
+  using SmemLayoutAtomA = decltype(cute::GMMA::smem_selector<
+      GmmaMajorA, MmaElementA, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))
+  >());
+  using SmemLayoutAtomB = decltype(cute::GMMA::smem_selector<
+      GmmaMajorB, MmaElementB, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))
+  >());
+
+  using DispatchPolicy = decltype(detail::generate_gmma_dispatch_policy<
+      MmaElementA, MmaElementB, ElementAccumulator, TileShape_MNK, ClusterShape_MNK, StageCountType, KernelScheduleType>());
+
+  using CollectiveOp = CollectiveMma<
+      DispatchPolicy,
+      TileShape_MNK,
+      ElementA,
+      TagToStrideA_t<GmemLayoutA>,
+      ElementB,
+      TagToStrideB_t<GmemLayoutB>,
+      TiledMma,
+      GmemTiledCopyA,
+      SmemLayoutAtomA,
+      void, // GMMA_SS does not need an SmemCopyAtom
+      cute::identity,
+      GmemTiledCopyB,
+      SmemLayoutAtomB,
+      void, // GMMA_SS does not need an SmemCopyAtom
+      cute::identity
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
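+
+// Before the cp_async builder below, the thread-layout arithmetic of
+// make_cp_async_gmem_tiled_copy in one concrete case (assumed numbers, illustration
+// only): 128 threads, a K-major A tile with (TileSizeMN, TileSizeK) = (128, 64),
+// and Alignment = 8 halves (16 bytes per cp.async):
+//
+//   constexpr int threads_major = 64 / 8;               //  8 threads along K
+//   constexpr int threads_minor = 128 / threads_major;  // 16 threads along M
+//   static_assert(threads_major * threads_minor == 128); // 16x8 grid, 1x8 values each
+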
+
+// GMMA_CpAsync_SS
+template <
+  class ElementA,
+  class GmemLayoutA,
+  int AlignmentA,
+  class ElementB,
+  class GmemLayoutB,
+  int AlignmentB,
+  class ElementAccumulator,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class StageCountType,
+  class KernelScheduleType
+>
+struct CollectiveBuilder<
+    arch::Sm90,
+    arch::OpClassTensorOp,
+    ElementA,
+    GmemLayoutA,
+    AlignmentA,
+    ElementB,
+    GmemLayoutB,
+    AlignmentB,
+    ElementAccumulator,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    StageCountType,
+    KernelScheduleType,
+    std::enable_if_t<
+      // Even if we could build a TMA kernel, let the user override and use cp_async instead
+      std::is_same_v<KernelMultistage, KernelScheduleType> ||
+      // But always guard against invalid TMA alignments and dispatch to cp_async
+      ((sizeof(ElementA) * AlignmentA) % detail::tma_alignment_bytes != 0) ||
+      ((sizeof(ElementB) * AlignmentB) % detail::tma_alignment_bytes != 0) ||
+      // dispatch non-TN tf32 and int8 kernels only to cp_async builder
+      ((sizeof(ElementA) != 2 || sizeof(ElementB) != 2) &&
+       (not std::is_same_v<cutlass::layout::RowMajor, GmemLayoutA> ||
+        not std::is_same_v<cutlass::layout::ColumnMajor, GmemLayoutB>))>
+> {
+  static_assert(is_static<TileShape_MNK>::value);
+  static_assert(is_static<ClusterShape_MNK>::value);
+
+  #ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+  static_assert(sizeof(ElementA) == 0, "Unsupported Toolkit for SM90 Collective Builder\n");
+  #endif
+
+  // For fp32 types, map to tf32 MMA value type
+  using MmaElementA = std::conditional_t<std::is_same_v<float, ElementA>, tfloat32_t, ElementA>;
+  using MmaElementB = std::conditional_t<std::is_same_v<float, ElementB>, tfloat32_t, ElementB>;
+
+  static_assert((sizeof(ElementA) * AlignmentA) % detail::cp_async_min_alignment_bytes == 0 and
+                (sizeof(ElementB) * AlignmentB) % detail::cp_async_min_alignment_bytes == 0,
+                "Minimum alignment required for cp.async is 4B.");
+
+  static constexpr cute::GMMA::Major GmmaMajorA = detail::tag_to_gmma_major_A<GmemLayoutA>();
+  static constexpr cute::GMMA::Major GmmaMajorB = detail::tag_to_gmma_major_B<GmemLayoutB>();
+
+  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector<
+      MmaElementA, MmaElementB, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>()));
+
+  using GmemTiledCopyA = decltype(detail::make_cp_async_gmem_tiled_copy<
+      128, ElementA, AlignmentA, TagToStrideA_t<GmemLayoutA>,
+      decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
+
+  using GmemTiledCopyB = decltype(detail::make_cp_async_gmem_tiled_copy<
+      128, ElementB, AlignmentB, TagToStrideB_t<GmemLayoutB>,
+      decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
+
+  using SmemLayoutAtomA = decltype(cute::GMMA::smem_selector<
+      GmmaMajorA, MmaElementA, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))
+  >());
+
+  using SmemLayoutAtomB = decltype(cute::GMMA::smem_selector<
+      GmmaMajorB, MmaElementB, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))
+  >());
+
+  static constexpr int PipelineStages = detail::compute_stage_count_or_override<
+      detail::sm90_smem_capacity_bytes, MmaElementA, MmaElementB, TileShape_MNK, StageCountType>();
+
+  using CollectiveOp = CollectiveMma<
+      MainloopSm90CpAsyncGmma<PipelineStages, ClusterShape_MNK>,
+      TileShape_MNK,
+      ElementA,
+      TagToStrideA_t<GmemLayoutA>,
+      ElementB,
+      TagToStrideB_t<GmemLayoutB>,
+      TiledMma,
+      GmemTiledCopyA,
+      SmemLayoutAtomA,
+      void, // GMMA_SS does not need an SmemCopyAtom
+      cute::identity,
+      GmemTiledCopyB,
+      SmemLayoutAtomB,
+      void, // GMMA_SS does not need an SmemCopyAtom
+      cute::identity
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/gemm/collective/collective_builder.hpp b/include/cutlass/gemm/collective/collective_builder.hpp
new file mode 100644
index 0000000000..3cd68a41de
--- /dev/null
+++ b/include/cutlass/gemm/collective/collective_builder.hpp
@@ -0,0 +1,78 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#include "collective_mma.hpp"
+
+namespace cutlass::gemm::collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Used to specify stage counts or dispatch to automatic computation of stage count
+template <int num_stages>
+struct StageCount { static constexpr int value = num_stages; };
+struct StageCountAuto {};
+
+// Used to automatically let the builder pick the kernel schedule.
+// Can be overridden with kernel schedule tags in cutlass/gemm/dispatch_policy.hpp
+struct KernelScheduleAuto {};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  class ArchTag,
+  class OpClass,
+  class ElementA,
+  class GmemLayoutA,
+  int AlignmentA,
+  class ElementB,
+  class GmemLayoutB,
+  int AlignmentB,
+  class ElementAccumulator,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class StageCountType,
+  class KernelScheduleType,
+  class Enable = void
+>
+struct CollectiveBuilder {
+  static_assert(sizeof(ElementA) == 0, "Could not build a collective for given parameters.");
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "builders/sm90_gmma_builder.inl"
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/gemm/collective/collective_mma.hpp b/include/cutlass/gemm/collective/collective_mma.hpp
new file mode 100644
index 0000000000..a2a9067571
--- /dev/null
+++ b/include/cutlass/gemm/collective/collective_mma.hpp
@@ -0,0 +1,71 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + class DispatchPolicy, + class TileShape, + class ElementA, + class StrideA, + class ElementB, + class StrideB, + class TiledMma, + class GmemTiledCopyA, + class SmemLayoutAtomA, + class SmemCopyAtomA, + class TransformA, + class GmemTiledCopyB, + class SmemLayoutAtomB, + class SmemCopyAtomB, + class TransformB +> +struct CollectiveMma { + static_assert(sizeof(ElementA) == 0, "Could not find a mainloop specialization."); +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "sm70_mma_twostage.hpp" +#include "sm80_mma_multistage.hpp" +#include "sm90_mma_multistage_gmma_ss.hpp" +#include "sm90_mma_tma_gmma_ss.hpp" +#include "sm90_mma_tma_gmma_ss_warpspecialized.hpp" +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/collective/sm70_mma_twostage.hpp b/include/cutlass/gemm/collective/sm70_mma_twostage.hpp new file mode 100644 index 0000000000..11e5515aed --- /dev/null +++ b/include/cutlass/gemm/collective/sm70_mma_twostage.hpp @@ -0,0 +1,588 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+
+#include "cute/algorithm/functional.hpp"
+#include "cute/algorithm/gemm.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cute/tensor_predicate.hpp"
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+using namespace cute;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
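+
+// Unlike the multistage mainloops later in this patch, the two-stage specializations
+// below keep a single smem tile per operand (no pipeline mode) and double-buffer
+// through registers instead. A quick footprint check (assumed sizes, illustration
+// only): for a 128x128x8 tile with 2-byte operands,
+//
+//   constexpr int smem_bytes = 2 * (128 * 8)   // smem_a
+//                            + 2 * (128 * 8);  // smem_b
+//   static_assert(smem_bytes == 4096);         // 4 KB of smem total
+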
+template <
+  class TileShape_,
+  class ElementA_,
+  class StrideA_,
+  class ElementB_,
+  class StrideB_,
+  class TiledMma_,
+  class GmemTiledCopyA_,
+  class SmemLayoutAtomA_,
+  class SmemCopyAtomA_,
+  class TransformA_,
+  class GmemTiledCopyB_,
+  class SmemLayoutAtomB_,
+  class SmemCopyAtomB_,
+  class TransformB_>
+struct CollectiveMma<
+    MainloopSm70TwoStageUnpredicated,
+    TileShape_,
+    ElementA_,
+    StrideA_,
+    ElementB_,
+    StrideB_,
+    TiledMma_,
+    GmemTiledCopyA_,
+    SmemLayoutAtomA_,
+    SmemCopyAtomA_,
+    TransformA_,
+    GmemTiledCopyB_,
+    SmemLayoutAtomB_,
+    SmemCopyAtomB_,
+    TransformB_>
+{
+  //
+  // Type Aliases
+  //
+  using DispatchPolicy = MainloopSm70TwoStageUnpredicated;
+  using TileShape = TileShape_;
+  using ElementA = ElementA_;
+  using StrideA = StrideA_;
+  using ElementB = ElementB_;
+  using StrideB = StrideB_;
+  using TiledMma = TiledMma_;
+  using ElementAccumulator = typename TiledMma::ValTypeC;
+  using GmemTiledCopyA = GmemTiledCopyA_;
+  using GmemTiledCopyB = GmemTiledCopyB_;
+  using SmemLayoutAtomA = SmemLayoutAtomA_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using SmemCopyAtomA = SmemCopyAtomA_;
+  using SmemCopyAtomB = SmemCopyAtomB_;
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+  using ArchTag = typename DispatchPolicy::ArchTag;
+
+  static_assert(rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  static_assert(rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  using SmemLayoutA = decltype(tile_to_shape(
+      SmemLayoutAtomA{},
+      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}))));
+  using SmemLayoutB = decltype(tile_to_shape(
+      SmemLayoutAtomB{},
+      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}))));
+
+  struct SharedStorage
+  {
+    cute::array_aligned<ElementA, cute::cosize_v<SmemLayoutA>> smem_a;
+    cute::array_aligned<ElementB, cute::cosize_v<SmemLayoutB>> smem_b;
+  };
+
+  struct Params {
+    ElementA const* ptr_A;
+    StrideA dA;
+    ElementB const* ptr_B;
+    StrideB dB;
+  };
+
+  //
+  // Methods
+  //
+
+  CollectiveMma() = default;
+
+  template <class Args>
+  static constexpr Params
+  to_underlying_arguments(Args const& args, void* workspace) {
+    (void) workspace;
+    return {args.ptr_A, args.dA, args.ptr_B, args.dB};
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  template <
+    class FrgTensorD,
+    class TensorA,
+    class TensorB,
+    class FrgTensorC,
+    class KTileIterator,
+    class ResidueMNK
+  >
+  CUTLASS_DEVICE void
+  operator() (
+      FrgTensorD &accum,
+      TensorA gA,
+      TensorB gB,
+      FrgTensorC const &src_accum,
+      KTileIterator k_tile_iter, int k_tile_count,
+      ResidueMNK residue_mnk,
+      int thread_idx,
+      char *smem_buf)
+  {
+    using namespace cute;
+
+    (void)residue_mnk;
+
+    static_assert(is_rmem<FrgTensorD>::value, "D tensor must be rmem resident.");
+    static_assert(is_gmem<TensorA>::value, "A tensor must be gmem resident.");
+    static_assert(is_gmem<TensorB>::value, "B tensor must be gmem resident.");
+    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+    static_assert(rank(SmemLayoutA{}) == 2,
+      "MainloopTwoStage must not have a smem shape with a pipeline mode.");
+    static_assert(rank(SmemLayoutB{}) == 2,
+      "MainloopTwoStage must not have a smem shape with a pipeline mode.");
+
+    // Construct shared memory tiles
+    SharedStorage& storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+    Tensor sA = make_tensor(make_smem_ptr(storage.smem_a.data()), SmemLayoutA{}); // (BLK_M,BLK_K)
+    Tensor sB = make_tensor(make_smem_ptr(storage.smem_b.data()), SmemLayoutB{}); // (BLK_N,BLK_K)
+
+    // Partition the copying of A and B tiles across the threads
+    GmemTiledCopyA gmem_tiled_copy_a;
+    GmemTiledCopyB gmem_tiled_copy_b;
+    auto copy_a_thr = gmem_tiled_copy_a.get_slice(thread_idx);
+    auto copy_b_thr = gmem_tiled_copy_b.get_slice(thread_idx);
+
+    Tensor tAgA = copy_a_thr.partition_S(gA); // (ACPY,ACPY_M,ACPY_K,k)
+    Tensor tAsA = copy_a_thr.partition_D(sA); // (ACPY,ACPY_M,ACPY_K)
+    Tensor tBgB = copy_b_thr.partition_S(gB); // (BCPY,BCPY_N,BCPY_K,k)
+    Tensor tBsB = copy_b_thr.partition_D(sB); // (BCPY,BCPY_N,BCPY_K)
+
+    // Allocate the register tiles for double buffering -- same shape as partitioned data
+    Tensor tArA = make_fragment_like(tAsA); // (ACPY,ACPY_M,ACPY_K)
+    Tensor tBrB = make_fragment_like(tBsB); // (BCPY,BCPY_N,BCPY_K)
+
+    // Tile MMA compute thread partitions and allocate accumulators
+    TiledMma tiled_mma;
+    auto thr_mma = tiled_mma.get_thread_slice(thread_idx);
+    Tensor tCrA = thr_mma.partition_fragment_A(sA); // (MMA,MMA_M,MMA_K)
+    Tensor tCrB = thr_mma.partition_fragment_B(sB); // (MMA,MMA_N,MMA_K)
+
+    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(accum));     // MMA_M
+    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(src_accum)); // MMA_M
+    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(accum));     // MMA_N
+    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(src_accum)); // MMA_N
+    CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB));      // MMA_K
+
+    //
+    // Copy Atom retiling
+    //
+
+    auto thr_copy_A = make_tiled_copy_A(SmemCopyAtomA{}, tiled_mma).get_thread_slice(thread_idx);
+    Tensor tCsA = thr_copy_A.partition_S(sA);
+    Tensor tCrA_copy_view = thr_copy_A.retile_D(tCrA);
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view)); // M
+
+    auto thr_copy_B = make_tiled_copy_B(SmemCopyAtomB{}, tiled_mma).get_thread_slice(thread_idx);
+    Tensor tCsB = thr_copy_B.partition_S(sB);
+    Tensor tCrB_copy_view = thr_copy_B.retile_D(tCrB);
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view)); // N
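+
+    // The prologue and mainloop below implement the classic two-stage schedule:
+    // gmem loads land in registers (tArA/tBrB) while the MMA consumes smem, and the
+    // registers are committed to smem once per k-tile. Steady state per k_tile
+    // (an illustrative summary, not additional synchronization):
+    //
+    //   k_block == 0:              issue gmem->rmem loads for the *next* k-tile
+    //   every k_block:             copy smem->rmem for k_block+1, then MMA on k_block
+    //   k_block == K_BLOCK_MAX-1:  __syncthreads(); commit rmem->smem; __syncthreads()
+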
+    //
+    // Prologue
+    //
+
+    // Copy gmem to rmem for the first k_tile
+    copy(gmem_tiled_copy_a, tAgA(_,_,_,*k_tile_iter), tArA);
+    copy(gmem_tiled_copy_b, tBgB(_,_,_,*k_tile_iter), tBrB);
+    if (--k_tile_count > 0) ++k_tile_iter;
+    // Copy rmem to smem
+    copy(tArA, tAsA);
+    copy(tBrB, tBsB);
+    // Make the smem stores visible to all threads
+    __syncthreads();
+
+    // Load A, B smem->rmem for k=0
+    copy(tCsA(_,_,0), tCrA_copy_view(_,_,0));
+    copy(tCsB(_,_,0), tCrB_copy_view(_,_,0));
+
+    //
+    // Mainloop
+    //
+
+    // Size of the k-tile's outer product mode (k)
+    auto K_BLOCK_MAX = size<2>(tCrA);
+
+    CUTLASS_PRAGMA_NO_UNROLL
+    while (k_tile_count > -1)
+    {
+      // Pipeline the outer products with a static for loop
+      for_each(make_int_sequence<K_BLOCK_MAX>{}, [&] (auto k_block)
+      {
+        if (k_block == K_BLOCK_MAX - 1)
+        {
+          __syncthreads();
+
+          // Copy rmem to smem
+          copy(tArA, tAsA);
+          copy(tBrB, tBsB);
+          __syncthreads();
+        }
+
+        // Load A, B smem->rmem for k+1
+        int k_block_next = (k_block + Int<1>{}) % K_BLOCK_MAX; // static
+        copy(tCsA(_,_,k_block_next), tCrA_copy_view(_,_,k_block_next));
+        copy(tCsB(_,_,k_block_next), tCrB_copy_view(_,_,k_block_next));
+        if (k_block == 0)
+        {
+          // Copy gmem to rmem
+          copy(gmem_tiled_copy_a, tAgA(_,_,_,*k_tile_iter), tArA);
+          copy(gmem_tiled_copy_b, tBgB(_,_,_,*k_tile_iter), tBrB);
+          if (--k_tile_count > 0) ++k_tile_iter;
+        }
+
+        // transform before compute
+        cute::transform(tCrA(_,_,k_block), TransformA{});
+        cute::transform(tCrB(_,_,k_block), TransformB{});
+
+        // Thread-level register gemm for k
+        // disambiguate gemm (shared with the namespace name)
+        cute::gemm(tiled_mma, accum, tCrA(_,_,k_block), tCrB(_,_,k_block), src_accum);
+      });
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  class TileShape_,
+  class ElementA_,
+  class StrideA_,
+  class ElementB_,
+  class StrideB_,
+  class TiledMma_,
+  class GmemTiledCopyA_,
+  class SmemLayoutAtomA_,
+  class SmemCopyAtomA_,
+  class TransformA_,
+  class GmemTiledCopyB_,
+  class SmemLayoutAtomB_,
+  class SmemCopyAtomB_,
+  class TransformB_>
+struct CollectiveMma<
+    MainloopSm70TwoStage,
+    TileShape_,
+    ElementA_,
+    StrideA_,
+    ElementB_,
+    StrideB_,
+    TiledMma_,
+    GmemTiledCopyA_,
+    SmemLayoutAtomA_,
+    SmemCopyAtomA_,
+    TransformA_,
+    GmemTiledCopyB_,
+    SmemLayoutAtomB_,
+    SmemCopyAtomB_,
+    TransformB_>
+{
+  //
+  // Type Aliases
+  //
+  using DispatchPolicy = MainloopSm70TwoStage;
+  using TileShape = TileShape_;
+  using ElementA = ElementA_;
+  using StrideA = StrideA_;
+  using ElementB = ElementB_;
+  using StrideB = StrideB_;
+  using TiledMma = TiledMma_;
+  using ElementAccumulator = typename TiledMma::ValTypeC;
+  using GmemTiledCopyA = GmemTiledCopyA_;
+  using GmemTiledCopyB = GmemTiledCopyB_;
+  using SmemLayoutAtomA = SmemLayoutAtomA_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using SmemCopyAtomA = SmemCopyAtomA_;
+  using SmemCopyAtomB = SmemCopyAtomB_;
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+  using ArchTag = typename DispatchPolicy::ArchTag;
+
+  static_assert(rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  static_assert(rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  using SmemLayoutA = decltype(tile_to_shape(
+      SmemLayoutAtomA{},
+      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}))));
+  using SmemLayoutB = decltype(tile_to_shape(
+      SmemLayoutAtomB{},
+      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}))));
+
+  struct SharedStorage
+  {
+    cute::array_aligned<ElementA, cute::cosize_v<SmemLayoutA>> smem_a;
+    cute::array_aligned<ElementB, cute::cosize_v<SmemLayoutB>> smem_b;
+  };
+
+  struct Params {
+    ElementA const* ptr_A;
+    StrideA dA;
+    ElementB const* ptr_B;
+    StrideB dB;
+  };
+
+  //
+  // Methods
+  //
+
+  CollectiveMma() = default;
+
+  template <class Args>
+  static constexpr Params
+  to_underlying_arguments(Args const& args, void* workspace) {
+    (void) workspace;
+    return {args.ptr_A, args.dA, args.ptr_B, args.dB};
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  template <
+    class FrgTensorD,
+    class TensorA,
+    class TensorB,
+    class FrgTensorC,
+    class KTileIterator,
+    class ResidueMNK
+  >
+  CUTLASS_DEVICE void
+  operator() (
+      FrgTensorD &accum,
+      TensorA gA,
+      TensorB gB,
+      FrgTensorC const &src_accum,
+      KTileIterator k_tile_iter, int k_tile_count,
+      ResidueMNK residue_mnk,
+      int thread_idx,
+      char *smem_buf)
+  {
+    using namespace cute;
+
+    static_assert(is_rmem<FrgTensorD>::value, "D tensor must be rmem resident.");
+    static_assert(is_gmem<TensorA>::value, "A tensor must be gmem resident.");
+    static_assert(is_gmem<TensorB>::value, "B tensor must be gmem resident.");
+    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+    static_assert(rank(SmemLayoutA{}) == 2,
+      "MainloopTwoStage must not have a smem shape with a pipeline mode.");
+    static_assert(rank(SmemLayoutB{}) == 2,
+      "MainloopTwoStage must not have a smem shape with a pipeline mode.");
+
+    // Construct shared memory tiles
+    SharedStorage& storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+    Tensor sA = make_tensor(make_smem_ptr(storage.smem_a.data()), SmemLayoutA{}); // (BLK_M,BLK_K)
+    Tensor sB = make_tensor(make_smem_ptr(storage.smem_b.data()), SmemLayoutB{}); // (BLK_N,BLK_K)
+
+    // Shift tensor so residue_k is at origin (Can't read any k_coord < residue_k)
+    // This aligns the tensor with BLK_K for all but the 0th k_tile
+    gA.data() = &gA(0, get<2>(residue_mnk), 0);
+    gB.data() = &gB(0, get<2>(residue_mnk), 0);
+
+    // Partition the copying of A and B tiles across the threads
+    GmemTiledCopyA gmem_tiled_copy_a;
+    GmemTiledCopyB gmem_tiled_copy_b;
+    auto gmem_thr_copy_a = gmem_tiled_copy_a.get_slice(thread_idx);
+    auto gmem_thr_copy_b = gmem_tiled_copy_b.get_slice(thread_idx);
+
+    Tensor tAgA = gmem_thr_copy_a.partition_S(gA); // (ACPY,ACPY_M,ACPY_K,k)
+    Tensor tAsA = gmem_thr_copy_a.partition_D(sA); // (ACPY,ACPY_M,ACPY_K)
+    Tensor tBgB = gmem_thr_copy_b.partition_S(gB); // (BCPY,BCPY_N,BCPY_K,k)
+    Tensor tBsB = gmem_thr_copy_b.partition_D(sB); // (BCPY,BCPY_N,BCPY_K)
+
+    // Allocate the register tiles for double buffering -- same shape as partitioned data
+    Tensor tArA = make_fragment_like(tAsA); // (ACPY,ACPY_M,ACPY_K)
+    Tensor tBrB = make_fragment_like(tBsB); // (BCPY,BCPY_N,BCPY_K)
+
+    //
+    // PREDICATES
+    //
+
+    // Allocate predicate tensors for m and n
+    Tensor tApA = make_tensor<bool>(make_shape(size<1>(tAsA), size<2>(tAsA)), Stride<_1,_0>{});
+    Tensor tBpB = make_tensor<bool>(make_shape(size<1>(tBsB), size<2>(tBsB)), Stride<_1,_0>{});
+
+    // Construct identity layout for sA and sB
+    Tensor cA = make_identity_tensor(make_shape(size<0>(sA), size<1>(sA))); // (BLK_M,BLK_K) -> (blk_m,blk_k)
+    Tensor cB
= make_identity_tensor(make_shape(size<0>(sB), size<1>(sB))); // (BLK_N,BLK_K) -> (blk_n,blk_k) + + // Repeat the partitioning with identity layouts + Tensor tAcA = gmem_thr_copy_a.partition_S(cA); // (ACPY,ACPY_M,ACPY_K) -> (blk_m,blk_k) + Tensor tBcB = gmem_thr_copy_b.partition_S(cB); // (BCPY,BCPY_N,BCPY_K) -> (blk_n,blk_k) + + // Set predicates for m bounds + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < size<0>(tApA); ++m) { + tApA(m,0) = get<0>(tAcA(0,m,0)) < get<0>(residue_mnk); // blk_m coord < residue_m + } + // Set predicates for n bounds + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < size<0>(tBpB); ++n) { + tBpB(n,0) = get<0>(tBcB(0,n,0)) < get<1>(residue_mnk); // blk_n coord < residue_n + } + + // + // PREFETCH + // + + // Clear the rmem tiles to account for predicated off loads + clear(tArA); + clear(tBrB); + + // Start async loads for 0th k-tile, where we take care of the k residue + { + Tensor tAgAk = tAgA(_,_,_,*k_tile_iter); + CUTLASS_PRAGMA_UNROLL + for (int k = 0; k < size<2>(tArA); ++k) { + if (get<1>(tAcA(0,0,k)) >= -get<2>(residue_mnk)) { // blk_k coord < residue_k (gA shifted) + copy_if(gmem_tiled_copy_a, tApA(_,k), tAgAk(_,_,k), tArA(_,_,k)); + } + } + Tensor tBgBk = tBgB(_,_,_,*k_tile_iter); + CUTLASS_PRAGMA_UNROLL + for (int k = 0; k < size<2>(tBrB); ++k) { + if (get<1>(tBcB(0,0,k)) >= -get<2>(residue_mnk)) { // blk_k coord < residue_k (gB shifted) + copy_if(gmem_tiled_copy_b, tBpB(_,k), tBgBk(_,_,k), tBrB(_,_,k)); + } + } + ++k_tile_iter; + --k_tile_count; + } + + // Tile MMA compute thread partitions and allocate accumulators + TiledMma tiled_mma; + auto thr_mma = tiled_mma.get_thread_slice(thread_idx); + Tensor tCrA = thr_mma.make_fragment_A(thr_mma.partition_A(sA)); // (MMA,MMA_M,MMA_K) + Tensor tCrB = thr_mma.make_fragment_B(thr_mma.partition_B(sB)); // (MMA,MMA_M,MMA_K) + + CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(accum)); // MMA_M + CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(src_accum)); // MMA_M + CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(accum)); // MMA_N + CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(src_accum)); // MMA_N + CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB)); // MMA_K + + // + // Copy Atom retiling + // + + auto thr_copy_A = make_tiled_copy_A(SmemCopyAtomA{}, tiled_mma).get_thread_slice(thread_idx); + Tensor tCsA = thr_copy_A.partition_S(sA); + Tensor tCrA_copy_view = thr_copy_A.retile_D(tCrA); + CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view)); // M + + auto thr_copy_B = make_tiled_copy_B(SmemCopyAtomB{}, tiled_mma).get_thread_slice(thread_idx); + Tensor tCsB = thr_copy_B.partition_S(sB); + Tensor tCrB_copy_view = thr_copy_B.retile_D(tCrB); + CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view)); // N + + // + // Prologue + // + + // Copy rmem to smem + copy(tArA, tAsA); + copy(tBrB, tBsB); + // Clear accumulators + __syncthreads(); + + // Load A, B smem->rmem for k=0 + copy(tCsA(_,_,0), tCrA_copy_view(_,_,0)); + copy(tCsB(_,_,0), tCrB_copy_view(_,_,0)); + // + // Mainloop + // + + // Size of the k-tiles's outer product mode (k) + auto K_BLOCK_MAX = size<2>(tCrA); + + CUTLASS_PRAGMA_NO_UNROLL + while (k_tile_count > -1) + { + // Pipeline the outer products with a static for loop + for_each(make_int_sequence{}, [&] (auto k_block) + { + if (k_block == K_BLOCK_MAX - 1) + { + __syncthreads(); + + // Copy rmem to smem + copy(tArA, tAsA); + copy(tBrB, tBsB); + __syncthreads(); + } + + // Load A, B smem->rmem for k+1 + int k_block_next = (k_block + Int<1>{}) % K_BLOCK_MAX; // static + 
copy(tCsA(_,_,k_block_next), tCrA_copy_view(_,_,k_block_next)); + copy(tCsB(_,_,k_block_next), tCrB_copy_view(_,_,k_block_next)); + if (k_block == 0) + { + if (k_tile_count <= 0) { + clear(tApA); + clear(tBpB); + } + copy_if(gmem_tiled_copy_a, tApA, tAgA(_,_,_,*k_tile_iter), tArA); + copy_if(gmem_tiled_copy_b, tBpB, tBgB(_,_,_,*k_tile_iter), tBrB); + ++k_tile_iter; + --k_tile_count; + } + + // transform before compute + cute::transform(tCrA(_,_,k_block), TransformA{}); + cute::transform(tCrB(_,_,k_block), TransformB{}); + + // Thread-level register gemm for k + // disambiguate gemm (shared with the namespace name) + cute::gemm(tiled_mma, accum, tCrA(_,_,k_block), tCrB(_,_,k_block), src_accum); + }); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/collective/sm80_mma_multistage.hpp b/include/cutlass/gemm/collective/sm80_mma_multistage.hpp new file mode 100644 index 0000000000..6ba6ccc008 --- /dev/null +++ b/include/cutlass/gemm/collective/sm80_mma_multistage.hpp @@ -0,0 +1,680 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+
+#include "cute/algorithm/functional.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cute/algorithm/gemm.hpp"
+#include "cute/tensor_predicate.hpp"
+#include "cute/numeric/arithmetic_tuple.hpp"
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+using namespace cute;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  int Stages,
+  class TileShape_,
+  class ElementA_,
+  class StrideA_,
+  class ElementB_,
+  class StrideB_,
+  class TiledMma_,
+  class GmemTiledCopyA_,
+  class SmemLayoutAtomA_,
+  class SmemCopyAtomA_,
+  class TransformA_,
+  class GmemTiledCopyB_,
+  class SmemLayoutAtomB_,
+  class SmemCopyAtomB_,
+  class TransformB_>
+struct CollectiveMma<
+    MainloopSm80CpAsyncUnpredicated<Stages>,
+    TileShape_,
+    ElementA_,
+    StrideA_,
+    ElementB_,
+    StrideB_,
+    TiledMma_,
+    GmemTiledCopyA_,
+    SmemLayoutAtomA_,
+    SmemCopyAtomA_,
+    TransformA_,
+    GmemTiledCopyB_,
+    SmemLayoutAtomB_,
+    SmemCopyAtomB_,
+    TransformB_>
+{
+  //
+  // Type Aliases
+  //
+  using DispatchPolicy = MainloopSm80CpAsyncUnpredicated<Stages>;
+  using TileShape = TileShape_;
+  using ElementA = ElementA_;
+  using StrideA = StrideA_;
+  using ElementB = ElementB_;
+  using StrideB = StrideB_;
+  using TiledMma = TiledMma_;
+  using ElementAccumulator = typename TiledMma::ValTypeC;
+  using GmemTiledCopyA = GmemTiledCopyA_;
+  using GmemTiledCopyB = GmemTiledCopyB_;
+  using SmemLayoutAtomA = SmemLayoutAtomA_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using SmemCopyAtomA = SmemCopyAtomA_;
+  using SmemCopyAtomB = SmemCopyAtomB_;
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+  using ArchTag = typename DispatchPolicy::ArchTag;
+
+  static_assert(rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  static_assert(rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  using SmemLayoutA = decltype(tile_to_shape(
+      SmemLayoutAtomA{},
+      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{})));
+  using SmemLayoutB = decltype(tile_to_shape(
+      SmemLayoutAtomB{},
+      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{})));
+
+  static_assert(DispatchPolicy::Stages >= 2, "CpAsync mainloop must have at least 2 stages in the pipeline.");
+
+  struct SharedStorage
+  {
+    cute::array_aligned<ElementA, cute::cosize_v<SmemLayoutA>> smem_a;
+    cute::array_aligned<ElementB, cute::cosize_v<SmemLayoutB>> smem_b;
+  };
+
+  struct Params {
+    ElementA const* ptr_A;
+    StrideA dA;
+    ElementB const* ptr_B;
+    StrideB dB;
+  };
+
+  //
+  // Methods
+  //
+
+  CollectiveMma() = default;
+
+  template <class Args>
+  static constexpr Params
+  to_underlying_arguments(Args const& args, void* workspace) {
+    (void) workspace;
+    return {args.ptr_A, args.dA, args.ptr_B, args.dB};
+  }
+
+  /// Perform a collective-scoped matrix multiply-accumulate
+  template <
+    class FrgTensorD,
+    class TensorA,
+    class TensorB,
+    class FrgTensorC,
+    class KTileIterator,
+    class ResidueMNK
+  >
+  CUTLASS_DEVICE void
+  operator() (
+      FrgTensorD &accum,
+      TensorA gA,
+      TensorB gB,
+      FrgTensorC const &src_accum,
+      KTileIterator k_tile_iter, int k_tile_count,
+      ResidueMNK residue_mnk,
+      int thread_idx,
+      char *smem_buf)
+  {
+    using namespace cute;
+
+    static_assert(is_rmem<FrgTensorD>::value, "D tensor must be rmem resident.");
+    static_assert(is_gmem<TensorA>::value, "A tensor must be gmem resident.");
+    static_assert(is_gmem<TensorB>::value, "B tensor must be gmem resident.");
+    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+    static_assert(rank(SmemLayoutA{}) == 3,
+      "MainloopSm80CpAsync must have a pipeline mode in the smem layout.");
+    static_assert(rank(SmemLayoutB{}) == 3,
+      "MainloopSm80CpAsync must have a pipeline mode in the smem layout.");
+
+    // Construct shared memory tiles
+    SharedStorage& storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+    Tensor sA = make_tensor(make_smem_ptr(storage.smem_a.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE)
+    Tensor sB = make_tensor(make_smem_ptr(storage.smem_b.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE)
+
+    CUTE_STATIC_ASSERT_V(size<0>(gA) == size<0>(sA));                 // BLK_M
+    CUTE_STATIC_ASSERT_V(size<1>(gA) == size<1>(sA));                 // BLK_K
+    CUTE_STATIC_ASSERT_V(size<0>(gB) == size<0>(sB));                 // BLK_N
+    CUTE_STATIC_ASSERT_V(size<1>(gB) == size<1>(sB));                 // BLK_K
+    CUTE_STATIC_ASSERT_V(size<1>(sA) == size<1>(sB));                 // BLK_K
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA)); // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB)); // PIPE
+
+    // Partition the copying of A and B tiles across the threads
+    GmemTiledCopyA gmem_tiled_copy_A;
+    GmemTiledCopyB gmem_tiled_copy_B;
+    auto gmem_thr_copy_A = gmem_tiled_copy_A.get_slice(thread_idx);
+    auto gmem_thr_copy_B = gmem_tiled_copy_B.get_slice(thread_idx);
+
+    Tensor tAgA = gmem_thr_copy_A.partition_S(gA); // (ACPY,ACPY_M,ACPY_K,k)
+    Tensor tAsA = gmem_thr_copy_A.partition_D(sA); // (ACPY,ACPY_M,ACPY_K,PIPE)
+    Tensor tBgB = gmem_thr_copy_B.partition_S(gB); // (BCPY,BCPY_N,BCPY_K,k)
+    Tensor tBsB = gmem_thr_copy_B.partition_D(sB); // (BCPY,BCPY_N,BCPY_K,PIPE)
+
+    //
+    // PREDICATES
+    //
+
+    (void) residue_mnk;
+    //assert(residue_mnk == make_tuple(0,0,0));
+
+    //
+    // PREFETCH
+    //
+
+    // Start async loads for all pipes but the last
+    CUTLASS_PRAGMA_UNROLL
+    for (int k_pipe = 0; k_pipe < DispatchPolicy::Stages-1; ++k_pipe) {
+      copy(gmem_tiled_copy_A, tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,k_pipe));
+      copy(gmem_tiled_copy_B, tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,k_pipe));
+      cp_async_fence();
+      --k_tile_count;
+      if (k_tile_count > 0) { ++k_tile_iter; }
+    }
+
+    //
+    // MMA Atom partitioning
+    //
+
+    // Tile MMA compute thread partitions and allocate accumulators
+    TiledMma tiled_mma;
+    auto thr_mma = tiled_mma.get_thread_slice(thread_idx);
+    Tensor tCrA = thr_mma.partition_fragment_A(sA(_,_,0)); // (MMA,MMA_M,MMA_K)
+    Tensor tCrB = thr_mma.partition_fragment_B(sB(_,_,0)); // (MMA,MMA_N,MMA_K)
+
+    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(accum));     // MMA_M
+    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(src_accum)); // MMA_M
+    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(accum));     // MMA_N
+    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(src_accum)); // MMA_N
+    CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB));      // MMA_K
+    CUTE_STATIC_ASSERT_V(size(gmem_tiled_copy_A) == size(tiled_mma));
+    CUTE_STATIC_ASSERT_V(size(gmem_tiled_copy_B) == size(tiled_mma));
+
+    //
+    // Copy Atom retiling
+    //
+
+    auto smem_tiled_copy_A = make_tiled_copy_A(SmemCopyAtomA{}, tiled_mma);
+    auto smem_thr_copy_A = smem_tiled_copy_A.get_thread_slice(thread_idx);
+    Tensor tCsA = smem_thr_copy_A.partition_S(sA);          // (CPY,CPY_M,CPY_K,PIPE)
+    Tensor tCrA_copy_view = smem_thr_copy_A.retile_D(tCrA); // (CPY,CPY_M,CPY_K)
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view)); // CPY_M
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCrA_copy_view)); // CPY_K
+
+    auto smem_tiled_copy_B = make_tiled_copy_B(SmemCopyAtomB{}, tiled_mma);
+    auto smem_thr_copy_B = smem_tiled_copy_B.get_thread_slice(thread_idx);
+    Tensor tCsB = smem_thr_copy_B.partition_S(sB);          // (CPY,CPY_N,CPY_K,PIPE)
+    Tensor tCrB_copy_view = smem_thr_copy_B.retile_D(tCrB); // (CPY,CPY_N,CPY_K)
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view)); // CPY_N
+    CUTE_STATIC_ASSERT_V(size<2>(tCsB) == size<2>(tCrB_copy_view)); // CPY_K
+
+    //
+    // PIPELINED MAIN LOOP
+    //
+
+    // Current pipe index in smem to read from
+    int smem_pipe_read  = 0;
+    // Current pipe index in smem to write to
+    int smem_pipe_write = DispatchPolicy::Stages-1;
+
+    Tensor tCsA_p = tCsA(_,_,_,smem_pipe_read);
+    Tensor tCsB_p = tCsB(_,_,_,smem_pipe_read);
+
+    // Size of the register pipeline
+    auto K_BLOCK_MAX = size<2>(tCrA);
+
+    // PREFETCH register pipeline
+    if (K_BLOCK_MAX > 1) {
+      // Wait until our first prefetched tile is loaded in
+      cp_async_wait<DispatchPolicy::Stages-2>();
+      __syncthreads();
+
+      // Prefetch the first rmem from the first k-tile
+      copy(smem_tiled_copy_A, tCsA_p(_,_,Int<0>{}), tCrA_copy_view(_,_,Int<0>{}));
+      copy(smem_tiled_copy_B, tCsB_p(_,_,Int<0>{}), tCrB_copy_view(_,_,Int<0>{}));
+    }
+
+    CUTLASS_PRAGMA_NO_UNROLL
+    for ( ; k_tile_count > -(DispatchPolicy::Stages-1); --k_tile_count)
+    {
+      // Pipeline the outer products with a static for loop.
+      //
+      // Note, the for_each() function is required here to ensure `k_block` is of type Int<x>.
+      for_each(make_int_sequence<K_BLOCK_MAX>{}, [&] (auto k_block)
+      {
+        if (k_block == K_BLOCK_MAX - 1)
+        {
+          // Slice the smem_pipe_read smem
+          tCsA_p = tCsA(_,_,_,smem_pipe_read);
+          tCsB_p = tCsB(_,_,_,smem_pipe_read);
+
+          // Commit the smem for smem_pipe_read
+          cp_async_wait<DispatchPolicy::Stages-2>();
+          __syncthreads();
+        }
+
+        // Load A, B shmem->regs for k_block+1
+        auto k_block_next = (k_block + Int<1>{}) % K_BLOCK_MAX; // static
+        copy(smem_tiled_copy_A, tCsA_p(_,_,k_block_next), tCrA_copy_view(_,_,k_block_next));
+        copy(smem_tiled_copy_B, tCsB_p(_,_,k_block_next), tCrB_copy_view(_,_,k_block_next));
+        // Copy gmem to smem before computing gemm on each k-pipe
+        if (k_block == 0)
+        {
+          copy(gmem_tiled_copy_A, tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,smem_pipe_write));
+          copy(gmem_tiled_copy_B, tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,smem_pipe_write));
+          cp_async_fence();
+          if (k_tile_count > 0) { ++k_tile_iter; }
+
+          // Advance the pipe -- Doing it here accounts for K_BLOCK_MAX = 1 (no rmem pipe)
+          smem_pipe_write = smem_pipe_read;
+          ++smem_pipe_read;
+          smem_pipe_read = (smem_pipe_read == DispatchPolicy::Stages) ? 0 : smem_pipe_read;
+        }
+
+        // Transform before compute
+        cute::transform(tCrA(_,_,k_block), TransformA{});
+        cute::transform(tCrB(_,_,k_block), TransformB{});
+        // Thread-level register gemm for k_block
+        cute::gemm(tiled_mma, accum, tCrA(_,_,k_block), tCrB(_,_,k_block), src_accum);
+      });
+
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
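+
+// The predicated variant below reuses the smem pipe rotation from the mainloop
+// above. With Stages = 3 (an assumed depth, for illustration), the read/write
+// slots evolve as follows:
+//
+//   int read = 0, write = Stages - 1;           // write trails read by Stages-1
+//   for (int k = 0; k < K_TILES; ++k) {
+//     write = read;                             // reuse the slot just consumed
+//     read  = (read + 1 == Stages) ? 0 : read + 1;
+//   }
+//
+//   k-tile:  0  1  2  3 ...    read: 0 1 2 0 ...    write: 2 0 1 2 ...
+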
+
+template <
+  int Stages,
+  class TileShape_,
+  class ElementA_,
+  class StrideA_,
+  class ElementB_,
+  class StrideB_,
+  class TiledMma_,
+  class GmemTiledCopyA_,
+  class SmemLayoutAtomA_,
+  class SmemCopyAtomA_,
+  class TransformA_,
+  class GmemTiledCopyB_,
+  class SmemLayoutAtomB_,
+  class SmemCopyAtomB_,
+  class TransformB_>
+struct CollectiveMma<
+    MainloopSm80CpAsync<Stages>,
+    TileShape_,
+    ElementA_,
+    StrideA_,
+    ElementB_,
+    StrideB_,
+    TiledMma_,
+    GmemTiledCopyA_,
+    SmemLayoutAtomA_,
+    SmemCopyAtomA_,
+    TransformA_,
+    GmemTiledCopyB_,
+    SmemLayoutAtomB_,
+    SmemCopyAtomB_,
+    TransformB_>
+{
+  //
+  // Type Aliases
+  //
+  using DispatchPolicy = MainloopSm80CpAsync<Stages>;
+  using TileShape = TileShape_;
+  using ElementA = ElementA_;
+  using StrideA = StrideA_;
+  using ElementB = ElementB_;
+  using StrideB = StrideB_;
+  using TiledMma = TiledMma_;
+  using ElementAccumulator = typename TiledMma::ValTypeC;
+  using GmemTiledCopyA = GmemTiledCopyA_;
+  using GmemTiledCopyB = GmemTiledCopyB_;
+  using SmemLayoutAtomA = SmemLayoutAtomA_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using SmemCopyAtomA = SmemCopyAtomA_;
+  using SmemCopyAtomB = SmemCopyAtomB_;
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+  using ArchTag = typename DispatchPolicy::ArchTag;
+
+  static_assert(rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  static_assert(rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  using SmemLayoutA = decltype(tile_to_shape(
+      SmemLayoutAtomA{},
+      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{})));
+  using SmemLayoutB = decltype(tile_to_shape(
+      SmemLayoutAtomB{},
+      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{})));
+
+  static_assert(DispatchPolicy::Stages >= 2, "CpAsync mainloop must have at least 2 stages in the pipeline.");
+
+  struct SharedStorage
+  {
+    cute::array_aligned<ElementA, cute::cosize_v<SmemLayoutA>> smem_a;
+    cute::array_aligned<ElementB, cute::cosize_v<SmemLayoutB>> smem_b;
+  };
+
+  struct Params {
+    ElementA const* ptr_A;
+    StrideA dA;
+    ElementB const* ptr_B;
+    StrideB dB;
+  };
+
+  //
+  // Methods
+  //
+
+  CollectiveMma() = default;
+
+  template <class Args>
+  static constexpr Params
+  to_underlying_arguments(Args const& args, void* workspace) {
+    (void) workspace;
+    return {args.ptr_A, args.dA, args.ptr_B, args.dB};
+  }
+
+  /// Perform a collective-scoped matrix multiply-accumulate
+  template <
+    class FrgTensorD,
+    class TensorA,
+    class TensorB,
+    class FrgTensorC,
+    class KTileIterator,
+    class ResidueMNK
+  >
+  CUTLASS_DEVICE void
+  operator() (
+      FrgTensorD &accum,
+      TensorA gA, // (BLK_M, BLK_K, K_TILES)
+      TensorB gB, // (BLK_N, BLK_K, K_TILES)
+      FrgTensorC const &src_accum,
+      KTileIterator k_tile_iter, int k_tile_count,
+      ResidueMNK residue_mnk,
+      int thread_idx,
+      char *smem_buf)
+  {
+    using namespace cute;
+
+    static_assert(is_rmem<FrgTensorD>::value, "D tensor must be rmem resident.");
+    static_assert(is_gmem<TensorA>::value, "A tensor must be gmem resident.");
+    static_assert(is_gmem<TensorB>::value, "B tensor must be gmem resident.");
+    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+    static_assert(rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
+    static_assert(rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
+
+    // Construct shared memory tiles
+    SharedStorage& storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+    Tensor sA = make_tensor(make_smem_ptr(storage.smem_a.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE)
+    Tensor sB = make_tensor(make_smem_ptr(storage.smem_b.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE)
+
+    CUTE_STATIC_ASSERT_V(size<0>(gA) == size<0>(sA));                 // BLK_M
+    CUTE_STATIC_ASSERT_V(size<1>(gA) == size<1>(sA));                 // BLK_K
+    CUTE_STATIC_ASSERT_V(size<0>(gB) == size<0>(sB));                 // BLK_N
+    CUTE_STATIC_ASSERT_V(size<1>(gB) == size<1>(sB));                 // BLK_K
+    CUTE_STATIC_ASSERT_V(size<1>(sA) == size<1>(sB));                 // BLK_K
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA)); // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB)); // PIPE
+
+    // Shift tensor so residue_k is at origin (Can't read any k_coord < residue_k)
+    // This aligns the tensor with BLK_K for all but the 0th k_tile
+    gA.data() = &gA(0, get<2>(residue_mnk), 0);
+    gB.data() = &gB(0, get<2>(residue_mnk), 0);
+
+    // Partition the copying of A and B tiles across the threads
+    GmemTiledCopyA gmem_tiled_copy_A;
+    GmemTiledCopyB gmem_tiled_copy_B;
+    auto gmem_thr_copy_A = gmem_tiled_copy_A.get_slice(thread_idx);
+    auto gmem_thr_copy_B = gmem_tiled_copy_B.get_slice(thread_idx);
+
+    Tensor tAgA = gmem_thr_copy_A.partition_S(gA); // (ACPY,ACPY_M,ACPY_K,k)
+    Tensor tAsA = gmem_thr_copy_A.partition_D(sA); // (ACPY,ACPY_M,ACPY_K,PIPE)
+    Tensor tBgB = gmem_thr_copy_B.partition_S(gB); // (BCPY,BCPY_N,BCPY_K,k)
+    Tensor tBsB = gmem_thr_copy_B.partition_D(sB); // (BCPY,BCPY_N,BCPY_K,PIPE)
+
+    //
+    // PREDICATES
+    //
+
+    // Allocate predicate tensors for m and n
+    Tensor tApA = make_tensor<bool>(make_shape(size<1>(tAsA), size<2>(tAsA)), Stride<_1,_0>{});
+    Tensor tBpB = make_tensor<bool>(make_shape(size<1>(tBsB), size<2>(tBsB)), Stride<_1,_0>{});
+
+    // Construct identity layout for sA and sB
+    Tensor cA = make_identity_tensor(make_shape(size<0>(sA), size<1>(sA))); // (BLK_M,BLK_K) -> (blk_m,blk_k)
+    Tensor cB = make_identity_tensor(make_shape(size<0>(sB), size<1>(sB))); // (BLK_N,BLK_K) -> (blk_n,blk_k)
+
+    // Repeat the partitioning with identity layouts
+    Tensor tAcA = gmem_thr_copy_A.partition_S(cA); // (ACPY,ACPY_M,ACPY_K) -> (blk_m,blk_k)
+    Tensor tBcB = gmem_thr_copy_B.partition_S(cB); // (BCPY,BCPY_N,BCPY_K) -> (blk_n,blk_k)
+
+    // Set predicates for m bounds
+    CUTLASS_PRAGMA_UNROLL
+    for (int m = 0; m < size<0>(tApA); ++m) {
+      tApA(m,0) = get<0>(tAcA(0,m,0)) < get<0>(residue_mnk); // blk_m coord < residue_m
+    }
+    // Set predicates for n bounds
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < size<0>(tBpB); ++n) {
+      tBpB(n,0) = get<0>(tBcB(0,n,0)) < get<1>(residue_mnk); // blk_n coord < residue_n
+    }
+
+    //
+    // PREFETCH
+    //
+
+    // Clear the smem tiles to account for predicated off loads
+    clear(tAsA);
+    clear(tBsB);
+
+    // Start async loads for 0th k-tile, where we take care of the k residue
+    {
+      constexpr int k_pipe = 0;
+
+      Tensor tAgAk = tAgA(_,_,_,*k_tile_iter);
+      CUTLASS_PRAGMA_UNROLL
+      for (int k = 0; k < size<2>(tAsA); ++k) {
+        if (get<1>(tAcA(0,0,k)) >= -get<2>(residue_mnk)) { // blk_k coord < residue_k (gA shifted)
+          copy_if(gmem_tiled_copy_A, tApA(_,k), tAgAk(_,_,k), tAsA(_,_,k,k_pipe));
+        }
+      }
+      Tensor tBgBk = tBgB(_,_,_,*k_tile_iter);
+      CUTLASS_PRAGMA_UNROLL
+      for (int k = 0; k < size<2>(tBsB); ++k) {
+        if (get<1>(tBcB(0,0,k)) >= -get<2>(residue_mnk)) { // blk_k coord < residue_k (gB shifted)
+          copy_if(gmem_tiled_copy_B, tBpB(_,k), tBgBk(_,_,k), tBsB(_,_,k,k_pipe));
+        }
+      }
+      cp_async_fence();
+      ++k_tile_iter;
+      --k_tile_count;
+    }
+
+    // Start async loads for 1st k-tile onwards, no k-residue handling needed
+    CUTLASS_PRAGMA_UNROLL
+    for (int k_pipe = 1; k_pipe < DispatchPolicy::Stages-1; ++k_pipe) {
+      if (k_tile_count <= 0) {
+        clear(tApA);
+        clear(tBpB);
+      }
+      copy_if(gmem_tiled_copy_A, tApA, tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,k_pipe)); // CpAsync
+      copy_if(gmem_tiled_copy_B, tBpB, tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,k_pipe)); // CpAsync
+      cp_async_fence();
+      ++k_tile_iter;
+      --k_tile_count;
+    }
+
+    //
+    // MMA Atom partitioning
+    //
+
+    // Tile MMA compute thread partitions and allocate accumulators
+    TiledMma tiled_mma;
+    auto thr_mma = tiled_mma.get_thread_slice(thread_idx);
+    Tensor tCrA = thr_mma.partition_fragment_A(sA(_,_,0)); // (MMA,MMA_M,MMA_K)
+    Tensor tCrB = thr_mma.partition_fragment_B(sB(_,_,0)); // (MMA,MMA_N,MMA_K)
+
+    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(accum));     // MMA_M
+    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(src_accum)); // MMA_M
+    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(accum));     // MMA_N
+    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(src_accum)); // MMA_N
+    CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB));      // MMA_K
+
+    //
+    // Copy Atom retiling
+    //
+
+    auto smem_tiled_copy_A = make_tiled_copy_A(SmemCopyAtomA{}, tiled_mma);
+    auto smem_thr_copy_A = smem_tiled_copy_A.get_thread_slice(thread_idx);
+    Tensor tCsA = smem_thr_copy_A.partition_S(sA);          // (CPY,CPY_M,CPY_K,PIPE)
+    Tensor tCrA_copy_view = smem_thr_copy_A.retile_D(tCrA); // (CPY,CPY_M,CPY_K)
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view)); // CPY_M
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCrA_copy_view)); // CPY_K
+
+    auto smem_tiled_copy_B = make_tiled_copy_B(SmemCopyAtomB{}, tiled_mma);
+    auto smem_thr_copy_B = smem_tiled_copy_B.get_thread_slice(thread_idx);
+    Tensor tCsB = smem_thr_copy_B.partition_S(sB);          // (CPY,CPY_N,CPY_K,PIPE)
+    Tensor tCrB_copy_view = smem_thr_copy_B.retile_D(tCrB); // (CPY,CPY_N,CPY_K)
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view)); // CPY_N
+    CUTE_STATIC_ASSERT_V(size<2>(tCsB) == size<2>(tCrB_copy_view)); // CPY_K
+
+    //
+    // PIPELINED MAIN LOOP
+    //
+
+    // Current pipe index in smem to read from
+    int smem_pipe_read  = 0;
+    // Current pipe index in smem to write to
+    int smem_pipe_write = DispatchPolicy::Stages-1;
+
+    Tensor tCsA_p = tCsA(_,_,_,smem_pipe_read);
+    Tensor tCsB_p = tCsB(_,_,_,smem_pipe_read);
+
+    // Size of the register pipeline
+    auto K_BLOCK_MAX = size<2>(tCrA);
+
+    // PREFETCH register pipeline
+    if (K_BLOCK_MAX > 1) {
+      // Wait until our first prefetched tile is loaded in
+      cp_async_wait<DispatchPolicy::Stages-2>();
+      __syncthreads();
+
+      // Prefetch the first rmem from the first k-tile
+      copy(smem_tiled_copy_A, tCsA_p(_,_,Int<0>{}), tCrA_copy_view(_,_,Int<0>{}));
+      copy(smem_tiled_copy_B, tCsB_p(_,_,Int<0>{}), tCrB_copy_view(_,_,Int<0>{}));
+    }
+
+    CUTLASS_PRAGMA_NO_UNROLL
+    for ( ; k_tile_count > -(DispatchPolicy::Stages-1); --k_tile_count)
+    {
+      // Pipeline the outer products with a static for loop.
+      //
+      // Note, the for_each() function is required here to ensure `k_block` is of type Int<x>.
+      for_each(make_int_sequence<K_BLOCK_MAX>{}, [&] (auto k_block)
+      {
+        if (k_block == K_BLOCK_MAX - 1)
+        {
+          // Slice the smem_pipe_read smem
+          tCsA_p = tCsA(_,_,_,smem_pipe_read);
+          tCsB_p = tCsB(_,_,_,smem_pipe_read);
+
+          // Commit the smem for smem_pipe_read
+          cp_async_wait<DispatchPolicy::Stages-2>();
+          __syncthreads();
+        }
+
+        // Load A, B shmem->regs for k_block+1
+        auto k_block_next = (k_block + Int<1>{}) % K_BLOCK_MAX; // static
+        copy(smem_tiled_copy_A, tCsA_p(_,_,k_block_next), tCrA_copy_view(_,_,k_block_next));
+        copy(smem_tiled_copy_B, tCsB_p(_,_,k_block_next), tCrB_copy_view(_,_,k_block_next));
+        // Copy gmem to smem before computing gemm on each k-pipe
+        if (k_block == 0)
+        {
+          // Set all predicates to false if we are going to overshoot bounds
+          if (k_tile_count <= 0) {
+            clear(tApA);
+            clear(tBpB);
+          }
+          copy_if(gmem_tiled_copy_A, tApA, tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,smem_pipe_write));
+          copy_if(gmem_tiled_copy_B, tBpB, tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,smem_pipe_write));
+          cp_async_fence();
+          ++k_tile_iter;
+
+          // Advance the pipe -- Doing it here accounts for K_BLOCK_MAX = 1 (no rmem pipe)
+          smem_pipe_write = smem_pipe_read;
+          ++smem_pipe_read;
+          smem_pipe_read = (smem_pipe_read == DispatchPolicy::Stages) ? 0 : smem_pipe_read;
+        }
+
+        // Transform before compute
+        cute::transform(tCrA(_,_,k_block), TransformA{});
+        cute::transform(tCrB(_,_,k_block), TransformB{});
+        // Thread-level register gemm for k_block
+        cute::gemm(tiled_mma, accum, tCrA(_,_,k_block), tCrB(_,_,k_block), src_accum);
+      });
+
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_ss.hpp b/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_ss.hpp
new file mode 100644
index 0000000000..3b1921b9cc
--- /dev/null
+++ b/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_ss.hpp
@@ -0,0 +1,596 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/pipeline.hpp" +#include "cute/arch/cluster_sm90.hpp" +#include "cutlass/arch/reg_reconfig.h" + +#include "cute/arch/copy_sm90.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cute/algorithm/gemm.hpp" + +#include + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::collective { +using namespace cute; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + int Stages, + class ClusterShape, + class TileShape_, + class ElementA_, + class StrideA_, + class ElementB_, + class StrideB_, + class TiledMma_, + class GmemTiledCopyA_, + class SmemLayoutAtomA_, + class SmemCopyAtomA_, + class TransformA_, + class GmemTiledCopyB_, + class SmemLayoutAtomB_, + class SmemCopyAtomB_, + class TransformB_> +struct CollectiveMma< + MainloopSm90CpAsyncGmmaUnpredicated, + TileShape_, + ElementA_, + StrideA_, + ElementB_, + StrideB_, + TiledMma_, + GmemTiledCopyA_, + SmemLayoutAtomA_, + SmemCopyAtomA_, + TransformA_, + GmemTiledCopyB_, + SmemLayoutAtomB_, + SmemCopyAtomB_, + TransformB_> +{ + // + // Type Aliases + // + using DispatchPolicy = MainloopSm90CpAsyncGmmaUnpredicated; + using TileShape = TileShape_; + using ElementA = ElementA_; + using StrideA = StrideA_; + using ElementB = ElementB_; + using StrideB = StrideB_; + using TiledMma = TiledMma_; + using ElementAccumulator = typename TiledMma::ValTypeC; + using GmemTiledCopyA = GmemTiledCopyA_; + using GmemTiledCopyB = GmemTiledCopyB_; + using SmemLayoutAtomA = SmemLayoutAtomA_; + using SmemLayoutAtomB = SmemLayoutAtomB_; + using SmemCopyAtomA = SmemCopyAtomA_; + using SmemCopyAtomB = SmemCopyAtomB_; + using TransformA = TransformA_; + using TransformB = TransformB_; + using ArchTag = typename DispatchPolicy::ArchTag; + + static_assert(rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); + static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + + static_assert(rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); + static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + + using SmemLayoutA = decltype(tile_to_shape( + SmemLayoutAtomA{}, + make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int{}))); + using SmemLayoutB = decltype(tile_to_shape( + SmemLayoutAtomB{}, + make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int{}))); + + 
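// For intuition: tile_to_shape above repeats the rank-2 smem atom until it
// fills the requested (BLK, BLK_K, PIPE) extents, appending the PIPE mode last.
// A host-side sketch with a toy column-major atom (all sizes invented; the real
// atoms in this file are swizzled GMMA layouts):

#include <cute/tensor.hpp>
#include <cstdio>

inline void smem_layout_sketch() {
  using namespace cute;
  auto atom = make_layout(make_shape(Int<8>{}, Int<8>{}));               // toy 8x8 atom
  auto smem = tile_to_shape(atom,
                            make_shape(Int<64>{}, Int<16>{}, Int<3>{})); // (BLK_M,BLK_K,PIPE)
  print(smem); printf("\n");  // rank-3 compact layout, cosize 64*16*3 elements
}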
static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 2 or more.");
+  static_assert(std::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
+                std::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
+                "MMA atom must source both A and B operand from smem_desc for this mainloop.");
+
+  struct SharedStorage
+  {
+    cute::array_aligned<ElementA, cute::cosize_v<SmemLayoutA>> smem_a;
+    cute::array_aligned<ElementB, cute::cosize_v<SmemLayoutB>> smem_b;
+  };
+
+  struct Params {
+    ElementA const* ptr_A;
+    StrideA dA;
+    ElementB const* ptr_B;
+    StrideB dB;
+  };
+
+  //
+  // Methods
+  //
+
+  CollectiveMma() = default;
+
+  template <class Args>
+  static constexpr Params
+  to_underlying_arguments(Args const& args, void* workspace) {
+    (void) workspace;
+    return {args.ptr_A, args.dA, args.ptr_B, args.dB};
+  }
+
+  /// Perform a collective-scoped matrix multiply-accumulate
+  template <
+    class TensorA,
+    class TensorB,
+    class FrgTensorC,
+    class KTileIterator,
+    class ResidueMNK
+  >
+  CUTLASS_DEVICE void
+  operator() (
+      TensorA gA,
+      TensorB gB,
+      FrgTensorC& accum,
+      KTileIterator k_tile_iter, int k_tile_count,
+      ResidueMNK residue_mnk,
+      int thread_idx,
+      char *smem_buf,
+      Params const& mainloop_params)
+  {
+    using namespace cute;
+
+    (void) residue_mnk;
+
+    static_assert(is_gmem<TensorA>::value, "A tensor must be gmem resident.");
+    static_assert(is_gmem<TensorB>::value, "B tensor must be gmem resident.");
+    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+    static_assert(rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2.");
+    static_assert(rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2.");
+    static_assert(rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
+    static_assert(rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
+    static_assert(std::is_same<TransformA, cute::identity>::value,
+      "SM90 warpgroup MMA must specify transforms through MMA_Atom.");
+    static_assert(std::is_same<TransformB, cute::identity>::value,
+      "SM90 warpgroup MMA must specify transforms through MMA_Atom.");
+    static_assert(std::is_same<SmemCopyAtomA, void>::value,
+      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+    static_assert(std::is_same<SmemCopyAtomB, void>::value,
+      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+
+    SharedStorage& storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+    Tensor sA = make_tensor(make_smem_ptr(storage.smem_a.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE)
+    Tensor sB = make_tensor(make_smem_ptr(storage.smem_b.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE)
+
+    // Partition the copying of A and B tiles across the threads
+    GmemTiledCopyA gmem_tiled_copy_a;
+    GmemTiledCopyB gmem_tiled_copy_b;
+    auto gmem_thr_copy_a = gmem_tiled_copy_a.get_slice(thread_idx);
+    auto gmem_thr_copy_b = gmem_tiled_copy_b.get_slice(thread_idx);
+
+    Tensor tAgA = gmem_thr_copy_a.partition_S(gA); // (ACPY,ACPY_M,ACPY_K,k)
+    Tensor tAsA = gmem_thr_copy_a.partition_D(sA); // (ACPY,ACPY_M,ACPY_K,PIPE)
+    Tensor tBgB = gmem_thr_copy_b.partition_S(gB); // (BCPY,BCPY_N,BCPY_K,k)
+    Tensor tBsB = gmem_thr_copy_b.partition_D(sB); // (BCPY,BCPY_N,BCPY_K,PIPE)
+
+    // Tile MMA atom and compute thread partitions across A, B and C
+    TiledMma tiled_mma;
+    auto thr_mma = tiled_mma.get_thread_slice(thread_idx);
+
+    // Allocate registers for pipelining
+    Tensor tCsA = thr_mma.partition_A(sA); // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCsB = thr_mma.partition_B(sB); // (MMA,MMA_N,MMA_K,PIPE)
+
+    Tensor tCrA = thr_mma.make_fragment_A(tCsA); // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCrB = thr_mma.make_fragment_B(tCsB); // (MMA,MMA_N,MMA_K,PIPE)
+
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum)); // M
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum)); //
N + CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB)); // K + CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB)); // PIPE + CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tAsA)); // PIPE + CUTE_STATIC_ASSERT_V(size<3>(tCsB) == size<3>(tBsB)); // PIPE + CUTE_STATIC_ASSERT_V(Int{} == size<2>(sA)); // PIPE + CUTE_STATIC_ASSERT_V(Int{} == size<2>(sB)); // PIPE + + // + // Prologue + // + + CUTLASS_PRAGMA_UNROLL + for (int k_pipe = 0; k_pipe < DispatchPolicy::Stages-1; ++k_pipe) { + copy(gmem_tiled_copy_a, tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,k_pipe)); + copy(gmem_tiled_copy_b, tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,k_pipe)); + cp_async_fence(); + ++k_tile_iter; + --k_tile_count; + } + + // Current pipe index in smem to read from + int smem_pipe_read = 0; + // Current pipe index in smem to write to + int smem_pipe_write = DispatchPolicy::Stages-1; + + // + // Pipelined Main Loop + // + CUTLASS_PRAGMA_NO_UNROLL + for ( ; k_tile_count > -(DispatchPolicy::Stages-1); --k_tile_count) + { + // Copy gmem to smem before computing gemm on each k-pipe + // pipe index in smem where the next gmem tile will be read into + copy(gmem_tiled_copy_a, tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,smem_pipe_write)); + copy(gmem_tiled_copy_b, tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,smem_pipe_write)); + cp_async_fence(); + if (k_tile_count > 0) { ++k_tile_iter; } + + // + // Compute on k_tile + // + warpgroup_fence_operand(accum); + warpgroup_arrive(); + + cp_async_wait(); + cute::gemm(tiled_mma, tCrA(_,_,_,smem_pipe_read), tCrB(_,_,_,smem_pipe_read), accum); + warpgroup_commit_batch(); + + // + // Advance the pipe + // + ++smem_pipe_read; + smem_pipe_read = (smem_pipe_read == DispatchPolicy::Stages) ? smem_pipe_read = 0 : smem_pipe_read; + + ++smem_pipe_write; + smem_pipe_write = (smem_pipe_write == DispatchPolicy::Stages) ? 
smem_pipe_write = 0 : smem_pipe_write; + + // Wait for the pipeline MMAs to drain + warpgroup_wait<0>(); + warpgroup_fence_operand(accum); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + int Stages, + class ClusterShape, + class TileShape_, + class ElementA_, + class StrideA_, + class ElementB_, + class StrideB_, + class TiledMma_, + class GmemTiledCopyA_, + class SmemLayoutAtomA_, + class SmemCopyAtomA_, + class TransformA_, + class GmemTiledCopyB_, + class SmemLayoutAtomB_, + class SmemCopyAtomB_, + class TransformB_> +struct CollectiveMma< + MainloopSm90CpAsyncGmma, + TileShape_, + ElementA_, + StrideA_, + ElementB_, + StrideB_, + TiledMma_, + GmemTiledCopyA_, + SmemLayoutAtomA_, + SmemCopyAtomA_, + TransformA_, + GmemTiledCopyB_, + SmemLayoutAtomB_, + SmemCopyAtomB_, + TransformB_> +{ + // + // Type Aliases + // + using DispatchPolicy = MainloopSm90CpAsyncGmma; + using TileShape = TileShape_; + using ElementA = ElementA_; + using StrideA = StrideA_; + using ElementB = ElementB_; + using StrideB = StrideB_; + using TiledMma = TiledMma_; + using ElementAccumulator = typename TiledMma::ValTypeC; using GmemTiledCopyA = GmemTiledCopyA_; + using GmemTiledCopyB = GmemTiledCopyB_; + using SmemLayoutAtomA = SmemLayoutAtomA_; + using SmemLayoutAtomB = SmemLayoutAtomB_; + using SmemCopyAtomA = SmemCopyAtomA_; + using SmemCopyAtomB = SmemCopyAtomB_; + using TransformA = TransformA_; + using TransformB = TransformB_; + using ArchTag = typename DispatchPolicy::ArchTag; + + static_assert(rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); + static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + + static_assert(rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); + static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + + using SmemLayoutA = decltype(tile_to_shape( + SmemLayoutAtomA{}, + make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int{}))); + using SmemLayoutB = decltype(tile_to_shape( + SmemLayoutAtomB{}, + make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int{}))); + + static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 1 or more."); + static_assert(std::is_base_of::value && + std::is_base_of::value, + "MMA atom must source both A and B operand from smem_desc for this mainloop."); + + struct SharedStorage + { + cute::array_aligned> smem_a; + cute::array_aligned> smem_b; + }; + + struct Params { + ElementA const* ptr_A; + StrideA dA; + ElementB const* ptr_B; + StrideB dB; + }; + + // + // Methods + // + + CollectiveMma() = default; + + template + static constexpr Params + to_underlying_arguments(Args const& args, void* workspace) { + (void) workspace; + return {args.ptr_A, args.dA, args.ptr_B, args.dB}; + } + + /// Perform a collective-scoped matrix multiply-accumulate + template < + class FrgTensorD, + class TensorA, + class TensorB, + class FrgTensorC, + class KTileIterator, + class ResidueMNK + > + CUTLASS_DEVICE void + operator() ( + FrgTensorD &accum, + TensorA gA, + TensorB gB, + FrgTensorC const &src_accum, + KTileIterator k_tile_iter, int 
k_tile_count, + ResidueMNK residue_mnk, + int thread_idx, + char *smem_buf) + { + using namespace cute; + + static_assert(is_rmem::value, "D tensor must be rmem resident."); + static_assert(is_gmem::value, "A tensor must be gmem resident."); + static_assert(is_gmem::value, "B tensor must be gmem resident."); + static_assert(is_rmem::value, "C tensor must be rmem resident."); + static_assert(rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2."); + static_assert(rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2."); + static_assert(rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3."); + static_assert(rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3."); + static_assert(std::is_same::value, + "SM90 warpgroup MMA must specify transforms through MMA_Atom."); + static_assert(std::is_same::value, + "SM90 warpgroup MMA must specify transforms through MMA_Atom."); + static_assert(std::is_same::value, + "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions."); + static_assert(std::is_same::value, + "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions."); + + SharedStorage& storage = *reinterpret_cast(smem_buf); + Tensor sA = make_tensor(make_smem_ptr(storage.smem_a.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) + Tensor sB = make_tensor(make_smem_ptr(storage.smem_b.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) + + // Shift tensor so residue_k is at origin (Can't read any k_coord < residue_k) + // This aligns the tensor with BLK_K for all but the 0th k_tile + gA.data() = &gA(0, get<2>(residue_mnk), 0); + gB.data() = &gB(0, get<2>(residue_mnk), 0); + + // Partition the copying of A and B tiles across the threads + GmemTiledCopyA gmem_tiled_copy_a; + GmemTiledCopyB gmem_tiled_copy_b; + auto gmem_thr_copy_a = gmem_tiled_copy_a.get_slice(thread_idx); + auto gmem_thr_copy_b = gmem_tiled_copy_b.get_slice(thread_idx); + + Tensor tAgA = gmem_thr_copy_a.partition_S(gA); // (ACPY,ACPY_M,ACPY_K,k) + Tensor tAsA = gmem_thr_copy_a.partition_D(sA); // (ACPY,ACPY_M,ACPY_K,PIPE) + Tensor tBgB = gmem_thr_copy_b.partition_S(gB); // (BCPY,BCPY_N,BCPY_K,k) + Tensor tBsB = gmem_thr_copy_b.partition_D(sB); // (BCPY,BCPY_N,BCPY_K,PIPE) + + // + // PREDICATES + // + + // Allocate predicate tensors for m and n + Tensor tApA = make_tensor(make_shape(size<1>(tAsA), size<2>(tAsA)), Stride<_1,_0>{}); + Tensor tBpB = make_tensor(make_shape(size<1>(tBsB), size<2>(tBsB)), Stride<_1,_0>{}); + + // Construct identity layout for sA and sB + Tensor cA = make_identity_tensor(make_shape(size<0>(sA), size<1>(sA))); // (BLK_M,BLK_K) -> (blk_m,blk_k) + Tensor cB = make_identity_tensor(make_shape(size<0>(sB), size<1>(sB))); // (BLK_N,BLK_K) -> (blk_n,blk_k) + + // Repeat the partitioning with identity layouts + Tensor tAcA = gmem_thr_copy_a.partition_S(cA); // (ACPY,ACPY_M,ACPY_K) -> (blk_m,blk_k) + Tensor tBcB = gmem_thr_copy_b.partition_S(cB); // (BCPY,BCPY_N,BCPY_K) -> (blk_n,blk_k) + + // Set predicates for m bounds + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < size<0>(tApA); ++m) { + tApA(m,0) = get<0>(tAcA(0,m,0)) < get<0>(residue_mnk); // blk_m coord < residue_m + } + // Set predicates for n bounds + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < size<0>(tBpB); ++n) { + tBpB(n,0) = get<0>(tBcB(0,n,0)) < get<1>(residue_mnk); // blk_n coord < residue_n + } + + // + // Prologue/PREFETCH + // + + // Clear the smem tiles to account for predicated off loads + clear(tAsA); + clear(tBsB); + + // Start async loads for 0th k-tile, where we take care of the k 
residue + { + constexpr int k_pipe = 0; + + Tensor tAgAk = tAgA(_,_,_,*k_tile_iter); + CUTLASS_PRAGMA_UNROLL + for (int k = 0; k < size<2>(tAsA); ++k) { + if (get<1>(tAcA(0,0,k)) >= -get<2>(residue_mnk)) { // blk_k coord < residue_k (gA shifted) + copy_if(gmem_tiled_copy_a, tApA(_,k), tAgAk(_,_,k), tAsA(_,_,k,k_pipe)); + } + } + Tensor tBgBk = tBgB(_,_,_,*k_tile_iter); + CUTLASS_PRAGMA_UNROLL + for (int k = 0; k < size<2>(tBsB); ++k) { + if (get<1>(tBcB(0,0,k)) >= -get<2>(residue_mnk)) { // blk_k coord < residue_k (gB shifted) + copy_if(gmem_tiled_copy_b, tBpB(_,k), tBgBk(_,_,k), tBsB(_,_,k,k_pipe)); + } + } + cp_async_fence(); + ++k_tile_iter; + --k_tile_count; + } + + // Start async loads for 1st k-tile onwards, no k-residue handling needed + CUTLASS_PRAGMA_UNROLL + for (int k_pipe = 1; k_pipe < DispatchPolicy::Stages-1; ++k_pipe) { + if (k_tile_count <= 0) { + clear(tApA); + clear(tBpB); + } + copy_if(gmem_tiled_copy_a, tApA, tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,k_pipe)); // CpAsync + copy_if(gmem_tiled_copy_b, tBpB, tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,k_pipe)); // CpAsync + cp_async_fence(); + ++k_tile_iter; + --k_tile_count; + } + + // + // MMA Atom partitioning + // + + // Tile MMA atom and compute thread partitions across A, B and C + TiledMma tiled_mma; + auto thr_mma = tiled_mma.get_thread_slice(thread_idx); + + // Allocate registers for pipelining + Tensor tCsA = thr_mma.partition_A(sA); // (MMA,MMA_M,MMA_K,PIPE) + Tensor tCsB = thr_mma.partition_B(sB); // (MMA,MMA_N,MMA_K,PIPE) + + Tensor tCrA = thr_mma.make_fragment_A(tCsA); // (MMA,MMA_N,MMA_K,PIPE) + Tensor tCrB = thr_mma.make_fragment_B(tCsB); // (MMA,MMA_M,MMA_N,PIPE) + + CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum)); // M + CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(src_accum)); // M + CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum)); // N + CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(src_accum)); // N + CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB)); // K + CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB)); // PIPE + CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tAsA)); // PIPE + CUTE_STATIC_ASSERT_V(size<3>(tCsB) == size<3>(tBsB)); // PIPE + CUTE_STATIC_ASSERT_V(Int{} == size<2>(sA)); // PIPE + CUTE_STATIC_ASSERT_V(Int{} == size<2>(sB)); // PIPE + + // Current pipe index in smem to read from + int smem_pipe_read = 0; + // Current pipe index in smem to write to + int smem_pipe_write = DispatchPolicy::Stages-1; + + // + // Pipelined Main Loop + // + CUTLASS_PRAGMA_NO_UNROLL + for ( ; k_tile_count > -(DispatchPolicy::Stages-1); --k_tile_count) + { + // + // Copy gmem to smem for *k_tile_iter + // + if (k_tile_count <= 0) { + clear(tApA); + clear(tBpB); + } + copy_if(gmem_tiled_copy_a, tApA, tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,smem_pipe_write)); // CpAsync + copy_if(gmem_tiled_copy_b, tBpB, tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,smem_pipe_write)); // CpAsync + cp_async_fence(); + ++k_tile_iter; + + // + // Compute on k_tile + // + warpgroup_fence_operand(accum); + warpgroup_arrive(); + + cp_async_wait(); + cute::gemm(tiled_mma, accum, tCrA(_,_,_,smem_pipe_read), tCrB(_,_,_,smem_pipe_read), src_accum); + warpgroup_commit_batch(); + + // + // Advance the pipe + // + ++smem_pipe_read; + smem_pipe_read = (smem_pipe_read == DispatchPolicy::Stages) ? smem_pipe_read = 0 : smem_pipe_read; + + ++smem_pipe_write; + smem_pipe_write = (smem_pipe_write == DispatchPolicy::Stages) ? 
smem_pipe_write = 0 : smem_pipe_write; + + // Wait for the pipeline MMAs to drain + warpgroup_wait<0>(); + warpgroup_fence_operand(accum); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss.hpp b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss.hpp new file mode 100644 index 0000000000..25eaffb74b --- /dev/null +++ b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss.hpp @@ -0,0 +1,480 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include "cutlass/cutlass.h" +#include "cute/arch/cluster_sm90.hpp" +#include "cute/arch/copy_sm90.hpp" +#include "cutlass/gemm/dispatch_policy.hpp" + +#include "cute/algorithm/functional.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cute/algorithm/gemm.hpp" +#include "cute/tensor_predicate.hpp" +#include "cute/numeric/arithmetic_tuple.hpp" +#include "cutlass/pipeline.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::collective { +using namespace cute; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + int Stages, + class ClusterShape, + int PipelineAsyncMmaStages, + class TileShape_, + class ElementA_, + class StrideA_, + class ElementB_, + class StrideB_, + class TiledMma_, + class GmemTiledCopyA_, + class SmemLayoutAtomA_, + class SmemCopyAtomA_, + class TransformA_, + class GmemTiledCopyB_, + class SmemLayoutAtomB_, + class SmemCopyAtomB_, + class TransformB_> +struct CollectiveMma< + MainloopSm90TmaGmma, + TileShape_, + ElementA_, + StrideA_, + ElementB_, + StrideB_, + TiledMma_, + GmemTiledCopyA_, + SmemLayoutAtomA_, + SmemCopyAtomA_, + TransformA_, + GmemTiledCopyB_, + SmemLayoutAtomB_, + SmemCopyAtomB_, + TransformB_> +{ + // + // Type Aliases + // + using DispatchPolicy = MainloopSm90TmaGmma; + using TileShape = TileShape_; + using ElementA = ElementA_; + using StrideA = StrideA_; + using ElementB = ElementB_; + using StrideB = StrideB_; + using TiledMma = TiledMma_; + using ElementAccumulator = typename TiledMma::ValTypeC; + using GmemTiledCopyA = GmemTiledCopyA_; + using GmemTiledCopyB = GmemTiledCopyB_; + using SmemLayoutAtomA = SmemLayoutAtomA_; + using SmemLayoutAtomB = SmemLayoutAtomB_; + using SmemCopyAtomA = SmemCopyAtomA_; + using SmemCopyAtomB = SmemCopyAtomB_; + using TransformA = TransformA_; + using TransformB = TransformB_; + using ArchTag = typename DispatchPolicy::ArchTag; + + using MainloopPipeline = cutlass::PipelineTmaAsync< + DispatchPolicy::Stages, + typename DispatchPolicy::ClusterShape>; + + using PipelineParams = typename MainloopPipeline::Params; + using PipelineState = typename cutlass::PipelineState; + + static_assert(rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); + static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + + static_assert(rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); + static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + + // Tile along K mode first before tiling over MN. PIPE mode last as usual. + // This maximizes TMA boxes due to better smem-K vectorization, reducing total issued TMAs. 
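// The Step<_2,_1,_3>{} argument used below tells tile_to_shape in what order to
// stack copies of the atom: K (mode 1) varies fastest, then M/N (mode 0), with
// PIPE (mode 2) outermost, yielding smem that is contiguous along K. A hedged
// host-side comparison with a toy atom (sizes invented):

#include <cute/tensor.hpp>
#include <cstdio>

inline void tile_order_sketch() {
  using namespace cute;
  auto atom  = make_layout(make_shape(Int<8>{}, Int<8>{}));
  auto shape = make_shape(Int<64>{}, Int<32>{}, Int<2>{});
  auto mn_first = tile_to_shape(atom, shape);                   // default mode order
  auto k_first  = tile_to_shape(atom, shape, Step<_2,_1,_3>{}); // K-major tiling
  print(mn_first); printf("\n");  // same shape and cosize as k_first,
  print(k_first);  printf("\n");  // but the atom replicas are strided differently
}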
+ using SmemLayoutA = decltype(tile_to_shape( + SmemLayoutAtomA{}, + make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int{}), + Step<_2,_1,_3>{})); + using SmemLayoutB = decltype(tile_to_shape( + SmemLayoutAtomB{}, + make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int{}), + Step<_2,_1,_3>{})); + + static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 1 or more."); + static_assert(std::is_base_of::value && + std::is_base_of::value, + "MMA atom must source both A and B operand from smem_desc for this mainloop."); + static_assert(std::is_same_v || std::is_same_v, + "GmemTiledCopy - invalid SM90 TMA copy atom specified."); + static_assert(std::is_same_v || std::is_same_v, + "GmemTiledCopy - invalid SM90 TMA copy atom specified."); + + // TMA converts f32 input to tf32 when copying from GMEM to SMEM + // For all other types, cast to size equivalent uint type to avoid any rounding by TMA. + static constexpr bool ConvertF32toTF32A = std::is_same_v; + static constexpr bool ConvertF32toTF32B = std::is_same_v; + using InternalElementA = std::conditional_t>>; + using InternalElementB = std::conditional_t>>; + + struct SharedStorage + { + cute::array_aligned> smem_A; + cute::array_aligned> smem_B; + + using PipelineStorage = typename MainloopPipeline::SharedStorage; + alignas(16) PipelineStorage pipeline_storage; + }; + + struct Params { + InternalElementA const* ptr_A; + StrideA dA; + InternalElementB const* ptr_B; + StrideB dB; + // Assumption: StrideA is congruent with Problem_MK + using TMA_A = decltype(make_tma_copy( + GmemTiledCopyA{}, + make_tensor(ptr_A, repeat_like(StrideA{}, int32_t(0)), dA), + SmemLayoutA{}(_,_,0), + make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})), + size<1>(ClusterShape{}))); // mcast along N mode for this M load, if any + // Assumption: StrideB is congruent with Problem_NK + using TMA_B = decltype(make_tma_copy( + GmemTiledCopyB{}, + make_tensor(ptr_B, repeat_like(StrideB{}, int32_t(0)), dB), + SmemLayoutB{}(_,_,0), + make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})), + size<0>(ClusterShape{}))); // mcast along M mode for this N load, if any + TMA_A tma_load_a; + TMA_B tma_load_b; + }; + + // + // Methods + // + + template + static constexpr Params + to_underlying_arguments(Args const& args, void* workspace) { + (void) workspace; + // Optionally append _1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK) + auto problem_shape_MNKL = append<4>(args.problem_shape, Int<1>{}); + auto M = get<0>(problem_shape_MNKL); + auto N = get<1>(problem_shape_MNKL); + auto K = get<2>(problem_shape_MNKL); + auto L = get<3>(problem_shape_MNKL); + + auto reinterpreted_ptr_A = reinterpret_cast(args.ptr_A); + auto reinterpreted_ptr_B = reinterpret_cast(args.ptr_B); + + Tensor tensor_a = make_tensor(reinterpreted_ptr_A, make_layout(make_shape(M,K,L), args.dA)); + Tensor tensor_b = make_tensor(reinterpreted_ptr_B, make_layout(make_shape(N,K,L), args.dB)); + typename Params::TMA_A tma_load_a = make_tma_copy( + GmemTiledCopyA{}, + tensor_a, + SmemLayoutA{}(_,_,cute::Int<0>{}), + make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})), + size<1>(ClusterShape{})); // mcast along N mode for this M load, if any + typename Params::TMA_B tma_load_b = make_tma_copy( + GmemTiledCopyB{}, + tensor_b, + SmemLayoutB{}(_,_,cute::Int<0>{}), + make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})), + size<0>(ClusterShape{})); // mcast along M mode for this N load, if any + return { + reinterpreted_ptr_A, + args.dA, + 
reinterpreted_ptr_B,
+      args.dB,
+      tma_load_a,
+      tma_load_b
+    };
+  }
+
+  /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance
+  CUTLASS_DEVICE
+  static void prefetch_tma_descriptors(Params const& mainloop_params)
+  {
+    cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor());
+    cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor());
+  }
+
+  /// Perform a collective-scoped matrix multiply-accumulate
+  template <
+    class TensorA, class TMA_LOAD_A,
+    class TensorB, class TMA_LOAD_B,
+    class FrgTensorC,
+    class KTileIterator
+  >
+  CUTLASS_DEVICE void
+  operator() (
+      TensorA const& gA, TMA_LOAD_A& tma_load_a,
+      TensorB const& gB, TMA_LOAD_B& tma_load_b,
+      FrgTensorC& accum,
+      KTileIterator k_tile_iter, int k_tile_count,
+      int thread_idx,
+      char* shared_memory,
+      Params const& mainloop_params)
+  {
+    using namespace cute;
+
+    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+    static_assert(rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2.");
+    static_assert(rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2.");
+    static_assert(rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
+    static_assert(rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
+    static_assert(std::is_void_v<SmemCopyAtomA>,
+      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+    static_assert(std::is_void_v<SmemCopyAtomB>,
+      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+
+    SharedStorage& storage = *reinterpret_cast<SharedStorage*>(shared_memory);
+    Tensor sA = make_tensor(make_smem_ptr(storage.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE)
+    Tensor sB = make_tensor(make_smem_ptr(storage.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE)
+
+    //
+    // Prepare the TMA loads for A and B
+    //
+    dim3 cluster_local_block_id = cute::block_id_in_cluster();
+    auto block_tma_a = tma_load_a.get_slice(cluster_local_block_id.y);
+    auto block_tma_b = tma_load_b.get_slice(cluster_local_block_id.x);
+
+    // Applies the mapping from block_tma_a
+    Tensor tAgA = block_tma_a.partition_S(gA); // (TMA,TMA_M,TMA_K,k)
+    Tensor tAsA = block_tma_a.partition_D(sA); // (TMA,TMA_M,TMA_K,PIPE)
+
+    Tensor tBgB = block_tma_b.partition_S(gB); // (TMA,TMA_N,TMA_K,k)
+    Tensor tBsB = block_tma_b.partition_D(sB); // (TMA,TMA_N,TMA_K,PIPE)
+
+    //
+    // Prepare TMA membars and PREFETCH
+    //
+
+    // Number of pipelined k-tiles in smem
+    constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
+
+    // NOTE: Another parameter: Partition the pipeline between active MMAs and active TMAs
+    // Tunable via the dispatch policy to tolerate latencies evenly across the math and copy stages
+    // K_PIPE_MMAS: The max number of active MMA pipes at beginning of every loop
+    // K_PIPE_TMAS: The max number of active TMA pipes at beginning of every loop (geq 1)
+    constexpr int K_PIPE_MMAS = DispatchPolicy::PipelineAsyncMmaStages;
+    constexpr int K_PIPE_TMAS = K_PIPE_MAX - K_PIPE_MMAS;
+    static_assert(0 <= K_PIPE_MMAS && K_PIPE_MMAS < K_PIPE_MAX);
+    static_assert(0 < K_PIPE_TMAS && K_PIPE_TMAS <= K_PIPE_MAX);
+
+    static_assert(K_PIPE_MMAS < K_PIPE_MAX - 1);
+
+    // Set the bytes transferred in this TMA transaction (may involve multiple issues)
+    constexpr uint32_t TmaTransactionBytes = static_cast<uint32_t>(
+        (size<0>(sA) * size<1>(sA) * sizeof(InternalElementA)) +
+        (size<0>(sB) * size<1>(sB) * sizeof(InternalElementB)));
+
+    // Obtain warp index
+    int warp_idx = canonical_warp_idx();
+    int warp_group_thread_idx = thread_idx %
NumThreadsPerWarpGroup; + + PipelineParams params; + params.transaction_bytes = TmaTransactionBytes; + params.role = MainloopPipeline::ThreadCategory::ProducerConsumer; + params.is_leader = warp_group_thread_idx == 0; + params.num_consumers = NumThreadsPerWarpGroup; + + MainloopPipeline pipeline( + storage.pipeline_storage, + params); + + // State variables used for iterating the circular buffer + // smem_pipe_read / release is used by the consumer of SMEM data - i.e MMA + // smem_pipe_write is used by the producer of SMEM data - i.e TMA + PipelineState smem_pipe_read; + PipelineState smem_pipe_release; + PipelineState smem_pipe_write = cutlass::make_producer_start_state(); + + // We need this to guarantee that the Pipeline init is visible + // To all producers and consumer blocks in the Cluster + if constexpr (size(ClusterShape{}) > 1) { + cute::cluster_arrive_relaxed(); + cute::cluster_wait(); + } + else { + __syncthreads(); + } + + // Set predicate for the lowest lane_id in the warp + int lane_predicate = cute::elect_one_sync(); + + uint16_t mcast_mask_a = 0; + uint16_t mcast_mask_b = 0; + // Keep a copy to know when to stop issuing loads + int k_tile_count_tma = k_tile_count; + + // Issue TmaLoads (Prologue fetches) + if (warp_idx == 0 && lane_predicate == 1) { + // Maps the tile -> block, value + if constexpr (std::is_same_v) { + auto block_layout = Layout{}; // (m,n) -> block_id + for (int n = 0; n < size<1>(block_layout); ++n) { + mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{})); + } + } + + if constexpr (std::is_same_v) { + auto block_layout = Layout{}; // (m,n) -> block_id + for (int m = 0; m < size<0>(block_layout); ++m) { + mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{})); + } + } + + // Issue the prologue loads + int prologue_tma_count = min(K_PIPE_MAX, k_tile_count); + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < prologue_tma_count; ++stage) { + pipeline.producer_acquire(smem_pipe_write); + using BarrierType = typename MainloopPipeline::ValueType; + BarrierType* tma_barrier = pipeline.producer_get_barrier(stage); + + copy(tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,stage)); + copy(tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,stage)); + ++k_tile_iter; + ++smem_pipe_write; + } + k_tile_count_tma -= prologue_tma_count; + } + + // + // Define C accumulators and A/B partitioning + // + + TiledMma tiled_mma; + auto thread_mma = tiled_mma.get_thread_slice(thread_idx); + + Tensor tCsA = thread_mma.partition_A(sA); // (MMA,MMA_M,MMA_K,PIPE) + Tensor tCsB = thread_mma.partition_B(sB); // (MMA,MMA_N,MMA_K,PIPE) + + // Allocate "fragments/descriptors" + Tensor tCrA = thread_mma.make_fragment_A(tCsA); // (MMA,MMA_M,MMA_K,PIPE) + Tensor tCrB = thread_mma.make_fragment_B(tCsB); // (MMA,MMA_N,MMA_K,PIPE) + + CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum)); // M + CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum)); // N + CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB)); // K + CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB)); // PIPE + CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tAsA)); // PIPE + CUTE_STATIC_ASSERT_V(size<3>(tCsB) == size<3>(tBsB)); // PIPE + CUTE_STATIC_ASSERT_V(Int{} == size<2>(sA)); // PIPE + CUTE_STATIC_ASSERT_V(Int{} == size<2>(sB)); // PIPE + + __syncthreads(); + + warpgroup_fence_operand(accum); + // Prologue MMAs + CUTLASS_PRAGMA_UNROLL + for (int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); + 
prologue_mma_count > 0; --prologue_mma_count) + { + // WAIT on smem_pipe_read until it's data is available + pipeline.consumer_wait(smem_pipe_read); + warpgroup_arrive(); + cute::gemm(tiled_mma, tCrA(_,_,_,smem_pipe_read.index()), tCrB(_,_,_,smem_pipe_read.index()), accum); // (V,M,K) x (V,N,K) => (V,M,N) + warpgroup_commit_batch(); + ++smem_pipe_read; + --k_tile_count; + } + warpgroup_fence_operand(accum); + + // + // PIPELINED MAIN LOOP + // + + CUTLASS_PRAGMA_NO_UNROLL + for ( ; k_tile_count > 0; --k_tile_count) + { + // WAIT on smem_pipe_read until data is available + pipeline.consumer_wait(smem_pipe_read); + + // + // Compute on k_tile + // + + warpgroup_fence_operand(accum); + warpgroup_arrive(); + cute::gemm(tiled_mma, tCrA(_,_,_,smem_pipe_read.index()), tCrB(_,_,_,smem_pipe_read.index()), accum); // (V,M,K) x (V,N,K) => (V,M,N) + warpgroup_commit_batch(); + + /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed + warpgroup_wait(); + warpgroup_fence_operand(accum); + + pipeline.consumer_release(smem_pipe_release); // UNLOCK wr stage, done _computing_ on it + + // + // Copy gmem to smem for *k_tile_iter + // + + // Do Acquire & Load only if needed - helps with both performance and also corner case illegal barrier-ops + if (warp_idx == 0 && lane_predicate == 1 && (k_tile_count_tma > 0) ) { + pipeline.producer_acquire(smem_pipe_write); // LOCK wr stage, for _writing_ + + using BarrierType = typename MainloopPipeline::ValueType; + BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write.index()); + + copy(tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,smem_pipe_write.index())); + copy(tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,smem_pipe_write.index())); + ++smem_pipe_write; + ++k_tile_iter; + --k_tile_count_tma; + } + + // Advance consumer pipeline + ++smem_pipe_read; + ++smem_pipe_release; + } + + // Wait on all GMMAs + warpgroup_wait<0>(); + warpgroup_fence_operand(accum); + + // Workaround for ensuring Smem destruction doesn't happen accidentally + if constexpr (size(typename DispatchPolicy::ClusterShape{}) > 1) { + cute::cluster_arrive(); + cute::cluster_wait(); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp new file mode 100644 index 0000000000..41b0f13b65 --- /dev/null +++ b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp @@ -0,0 +1,494 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cutlass/cutlass.h" +#include "cute/arch/cluster_sm90.hpp" +#include "cute/arch/copy_sm90.hpp" +#include "cutlass/gemm/dispatch_policy.hpp" + +#include "cute/algorithm/functional.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cute/algorithm/gemm.hpp" +#include "cute/tensor_predicate.hpp" +#include "cute/numeric/arithmetic_tuple.hpp" +#include "cutlass/pipeline.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::collective { +using namespace cute; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// WarpSpecialized Mainloop +template < + int Stages, + class ClusterShape, + class KernelSchedule, + class TileShape_, + class ElementA_, + class StrideA_, + class ElementB_, + class StrideB_, + class TiledMma_, + class GmemTiledCopyA_, + class SmemLayoutAtomA_, + class SmemCopyAtomA_, + class TransformA_, + class GmemTiledCopyB_, + class SmemLayoutAtomB_, + class SmemCopyAtomB_, + class TransformB_> +struct CollectiveMma< + MainloopSm90TmaGmmaWarpSpecialized, + TileShape_, + ElementA_, + StrideA_, + ElementB_, + StrideB_, + TiledMma_, + GmemTiledCopyA_, + SmemLayoutAtomA_, + SmemCopyAtomA_, + TransformA_, + GmemTiledCopyB_, + SmemLayoutAtomB_, + SmemCopyAtomB_, + TransformB_> +{ + // + // Type Aliases + // + using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecialized; + using TileShape = TileShape_; + using ElementA = ElementA_; + using StrideA = StrideA_; + using ElementB = ElementB_; + using StrideB = StrideB_; + using TiledMma = TiledMma_; + using ElementAccumulator = typename TiledMma::ValTypeC; + using GmemTiledCopyA = GmemTiledCopyA_; + using GmemTiledCopyB = GmemTiledCopyB_; + using SmemLayoutAtomA = SmemLayoutAtomA_; + using SmemLayoutAtomB = SmemLayoutAtomB_; + using SmemCopyAtomA = SmemCopyAtomA_; + using SmemCopyAtomB = SmemCopyAtomB_; + using TransformA = TransformA_; + using TransformB = TransformB_; + using ArchTag = typename DispatchPolicy::ArchTag; + + using MainloopPipeline = cutlass::PipelineTmaAsync< + DispatchPolicy::Stages, + typename DispatchPolicy::ClusterShape>; + using PipelineState = cutlass::PipelineState; + + 
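// PipelineState, declared just above, tracks a position in the circular stage
// buffer as an (index, phase) pair: advancing wraps the index at Stages and
// flips the phase bit, which is what the mbarrier wait protocol compares
// against to tell a fresh stage from a reused one. An illustrative standalone
// model (names invented; this is not the cutlass::PipelineState definition):

#include <cstdint>

template <int Stages>
struct ToyPipelineState {
  int      index = 0;  // smem stage to use next
  uint32_t phase = 0;  // parity the stage's mbarrier is expected to show

  ToyPipelineState& operator++() {
    if (++index == Stages) {
      index = 0;
      phase ^= 1;      // wrap-around flips phase so stage reuse is detectable
    }
    return *this;
  }
};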
using PipelineParams = typename MainloopPipeline::Params; + + static_assert(rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); + static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + + static_assert(rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); + static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + + // Tile along K mode first before tiling over MN. PIPE mode last as usual. + // This maximizes TMA boxes due to better smem-K vectorization, reducing total issued TMAs. + using SmemLayoutA = decltype(tile_to_shape( + SmemLayoutAtomA{}, + make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int{}), + Step<_2,_1,_3>{})); + using SmemLayoutB = decltype(tile_to_shape( + SmemLayoutAtomB{}, + make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int{}), + Step<_2,_1,_3>{})); + + static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 1 or more."); + static_assert(std::is_base_of::value && + std::is_base_of::value, + "MMA atom must source both A and B operand from smem_desc for this mainloop."); + static_assert(std::is_same_v || std::is_same_v, + "GmemTiledCopy - invalid SM90 TMA copy atom specified."); + static_assert(std::is_same_v || std::is_same_v, + "GmemTiledCopy - invalid SM90 TMA copy atom specified."); + + // TMA converts f32 input to tf32 when copying from GMEM to SMEM + // For all other types, cast to size equivalent uint type to avoid any rounding by TMA. 
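// The remapping below lets TMA round only where rounding is wanted: f32 inputs
// are tagged for the in-flight f32->tf32 conversion, while every other element
// type is viewed as a same-width unsigned integer so TMA moves raw bits. A
// compact standalone sketch of the pattern (toy_uint_bit, toy_half and toy_tf32
// stand in for cutlass::uint_bit_t, half_t and tfloat32_t):

#include <cstdint>
#include <type_traits>

template <int Bits> struct toy_uint_bit;
template <> struct toy_uint_bit<16> { using type = uint16_t; };
template <> struct toy_uint_bit<32> { using type = uint32_t; };

struct toy_half { uint16_t storage; };  // 16-bit payload, like half_t
struct toy_tf32 { uint32_t storage; };  // tf32 stand-in

template <class Element>
using toy_internal_t = std::conditional_t<
    std::is_same_v<Element, float>,
    toy_tf32,                                               // TMA rounds f32 -> tf32 in flight
    typename toy_uint_bit<int(sizeof(Element)) * 8>::type>; // otherwise raw bit moves

static_assert(std::is_same_v<toy_internal_t<float>, toy_tf32>);
static_assert(std::is_same_v<toy_internal_t<toy_half>, uint16_t>);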
+ static constexpr bool ConvertF32toTF32A = std::is_same_v; + static constexpr bool ConvertF32toTF32B = std::is_same_v; + using InternalElementA = std::conditional_t>>; + using InternalElementB = std::conditional_t>>; + + struct SharedStorage + { + cute::array_aligned> smem_A; + cute::array_aligned> smem_B; + + using PipelineStorage = typename MainloopPipeline::SharedStorage; + alignas(16) PipelineStorage pipeline_storage; + }; + + struct Params { + InternalElementA const* ptr_A; + StrideA dA; + InternalElementB const* ptr_B; + StrideB dB; + // Assumption: StrideA is congruent with Problem_MK + using TMA_A = decltype(make_tma_copy( + GmemTiledCopyA{}, + make_tensor(ptr_A, repeat_like(StrideA{}, int32_t(0)), dA), + SmemLayoutA{}(_,_,0), + make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})), + size<1>(ClusterShape{}))); // mcast along N mode for this M load, if any + // Assumption: StrideB is congruent with Problem_NK + using TMA_B = decltype(make_tma_copy( + GmemTiledCopyB{}, + make_tensor(ptr_B, repeat_like(StrideB{}, int32_t(0)), dB), + SmemLayoutB{}(_,_,0), + make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})), + size<0>(ClusterShape{}))); // mcast along M mode for this N load, if any + TMA_A tma_load_a; + TMA_B tma_load_b; + }; + + // + // Methods + // + + template + static constexpr Params + to_underlying_arguments(Args const& args, void* workspace) { + (void) workspace; + // Optionally append _1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK) + auto problem_shape_MNKL = append<4>(args.problem_shape, Int<1>{}); + auto M = get<0>(problem_shape_MNKL); + auto N = get<1>(problem_shape_MNKL); + auto K = get<2>(problem_shape_MNKL); + auto L = get<3>(problem_shape_MNKL); + + auto reinterpreted_ptr_A = reinterpret_cast(args.ptr_A); + auto reinterpreted_ptr_B = reinterpret_cast(args.ptr_B); + + Tensor tensor_a = make_tensor(reinterpreted_ptr_A, make_layout(make_shape(M,K,L), args.dA)); + Tensor tensor_b = make_tensor(reinterpreted_ptr_B, make_layout(make_shape(N,K,L), args.dB)); + typename Params::TMA_A tma_load_a = make_tma_copy( + GmemTiledCopyA{}, + tensor_a, + SmemLayoutA{}(_,_,cute::Int<0>{}), + make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})), + size<1>(ClusterShape{})); // mcast along N mode for this M load, if any + typename Params::TMA_B tma_load_b = make_tma_copy( + GmemTiledCopyB{}, + tensor_b, + SmemLayoutB{}(_,_,cute::Int<0>{}), + make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})), + size<0>(ClusterShape{})); // mcast along M mode for this N load, if any + return { + reinterpreted_ptr_A, + args.dA, + reinterpreted_ptr_B, + args.dB, + tma_load_a, + tma_load_b + }; + } + + static constexpr int K_PIPE_MAX = DispatchPolicy::Stages; + static constexpr int K_PIPE_MMAS = 1; + static constexpr uint32_t TmaTransactionBytes = + (size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast(sizeof(ElementA)))+ + (size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast(sizeof(ElementB))); + + CUTLASS_DEVICE + static MainloopPipeline make_pipeline(char* shared_memory, PipelineParams params){ + SharedStorage& shared_storage = *reinterpret_cast(shared_memory); + return {shared_storage.pipeline_storage, params}; + } + + /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance + CUTLASS_DEVICE + static void prefetch_tma_descriptors(Params const& mainloop_params) + { + cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor()); + 
cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor()); + } + + /// Perform a collective-scoped matrix multiply-accumulate + /// Producer Perspective + template < + class TensorA, class TMA_LOAD_A, + class TensorB, class TMA_LOAD_B, + class KTileIterator + > + CUTLASS_DEVICE void + dma(MainloopPipeline pipeline, + PipelineState smem_pipe_write, + TensorA const& gA, TMA_LOAD_A& tma_load_a, + TensorB const& gB, TMA_LOAD_B& tma_load_b, + KTileIterator k_tile_iter, int k_tile_count, + int thread_idx, + char* shared_memory) + { + + using namespace cute; + int warp_idx = canonical_warp_idx(); + int warp_idx_in_warp_group = warp_idx % 4; + int lane_predicate = cute::elect_one_sync(); + + if (warp_idx_in_warp_group == 0 and lane_predicate) { + SharedStorage& shared_storage = *reinterpret_cast(shared_memory); + Tensor sA = make_tensor(make_smem_ptr(shared_storage.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) + Tensor sB = make_tensor(make_smem_ptr(shared_storage.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) + + // + // Prepare the TMA loads for A and B + // + + dim3 cluster_local_block_id = cute::block_id_in_cluster(); + auto block_tma_a = tma_load_a.get_slice(cluster_local_block_id.y); + auto block_tma_b = tma_load_b.get_slice(cluster_local_block_id.x); + + // Applies the mapping from block_tma_a + Tensor tAgA = block_tma_a.partition_S(gA); // (TMA,TMA_M,TMA_K,k) + Tensor tAsA = block_tma_a.partition_D(sA); // (TMA,TMA_M,TMA_K,PIPE) + + Tensor tBgB = block_tma_b.partition_S(gB); // (TMA,TMA_N,TMA_K,k) + Tensor tBsB = block_tma_b.partition_D(sB); // (TMA,TMA_N,TMA_K,PIPE) + + uint16_t mcast_mask_a = 0; + uint16_t mcast_mask_b = 0; + + // Issue TmaLoads + // Maps the tile -> block, value + if constexpr (std::is_same_v) { + auto block_layout = Layout{}; // (m,n) -> block_id + for (int n = 0; n < size<1>(block_layout); ++n) { + mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{})); + } + } + + if constexpr (std::is_same_v) { + auto block_layout = Layout{}; // (m,n) -> block_id + for (int m = 0; m < size<0>(block_layout); ++m) { + mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{})); + } + } + + // Issue the prologue loads + int k_tile_prologue = min(k_tile_count, K_PIPE_MAX); + CUTLASS_PRAGMA_UNROLL + for (int count = 0; count < k_tile_prologue; ++count) { + pipeline.producer_acquire(smem_pipe_write); + int write_stage = smem_pipe_write.index(); + using BarrierType = typename MainloopPipeline::ValueType; + BarrierType* tma_barrier = pipeline.producer_get_barrier(write_stage); + + copy(tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage)); + copy(tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage)); + ++k_tile_iter; + ++smem_pipe_write; + } + k_tile_count -= k_tile_prologue; + + // Mainloop + CUTLASS_PRAGMA_NO_UNROLL + for ( ; k_tile_count > 0; --k_tile_count) + { + // LOCK smem_pipe_write for _writing_ + pipeline.producer_acquire(smem_pipe_write); + + // + // Copy gmem to smem for *k_tile_iter + // + + int write_stage = smem_pipe_write.index(); + using BarrierType = typename MainloopPipeline::ValueType; + BarrierType* tma_barrier = pipeline.producer_get_barrier(write_stage); + + copy(tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage)); + copy(tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage)); + ++k_tile_iter; + + // Advance 
smem_pipe_write + ++smem_pipe_write; + } + } + } + + /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster + CUTLASS_DEVICE void + dma_epilogue(MainloopPipeline pipeline, + PipelineState smem_pipe_write) + { + int warp_idx = canonical_warp_idx(); + int warp_idx_in_warp_group = warp_idx % 4; + int lane_predicate = cute::elect_one_sync(); + + // Issue the epilogue waits + if (warp_idx_in_warp_group == 0 and lane_predicate) { + /* This helps avoid early exit of blocks in Cluster + * Waits for all stages to either be released (all + * Consumer UNLOCKs), or if the stage was never used + * then would just be acquired since the phase was + * still inverted from make_producer_start_state + */ + for (int count = 0; count < K_PIPE_MAX; ++count) { + pipeline.producer_acquire(smem_pipe_write); + ++smem_pipe_write; + } + } + } + + /// Perform a collective-scoped matrix multiply-accumulate + /// Consumer Perspective + template < + class FrgTensorC + > + CUTLASS_DEVICE void + mma(MainloopPipeline pipeline, + PipelineState smem_pipe_read, + FrgTensorC& accum, + int k_tile_count, + int thread_idx, + char* shared_memory, + Params const& mainloop_params + ) + { + using namespace cute; + + static_assert(is_rmem::value, "C tensor must be rmem resident."); + static_assert(rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3."); + static_assert(rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3."); + static_assert(std::is_void_v, + "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions."); + static_assert(std::is_void_v, + "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions."); + + SharedStorage& shared_storage = *reinterpret_cast(shared_memory); + Tensor sA = make_tensor(make_smem_ptr(shared_storage.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) + Tensor sB = make_tensor(make_smem_ptr(shared_storage.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) + + // + // Define C accumulators and A/B partitioning + // + + TiledMma tiled_mma; + auto thread_mma = tiled_mma.get_thread_slice(thread_idx); + + Tensor tCsA = thread_mma.partition_A(sA); // (MMA,MMA_M,MMA_K,PIPE) + Tensor tCsB = thread_mma.partition_B(sB); // (MMA,MMA_N,MMA_K,PIPE) + + // Allocate "fragments/descriptors" + Tensor tCrA = thread_mma.make_fragment_A(tCsA); // (MMA,MMA_M,MMA_K,PIPE) + Tensor tCrB = thread_mma.make_fragment_B(tCsB); // (MMA,MMA_N,MMA_K,PIPE) + + CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum)); // M + CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum)); // N + CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB)); // K + CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB)); // PIPE + CUTE_STATIC_ASSERT_V(Int{} == size<2>(sA)); // PIPE + CUTE_STATIC_ASSERT_V(Int{} == size<2>(sB)); // PIPE + + // + // PIPELINED MAIN LOOP + // + static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS < K_PIPE_MAX), + "ERROR : Incorrect number of MMAs in flight"); + + // We release buffers to producer warps(dma) with some mmas in flight + PipelineState smem_pipe_release = smem_pipe_read; + + // Prologue GMMAs + int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); + + warpgroup_fence_operand(accum); + CUTLASS_PRAGMA_UNROLL + for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue) + { + // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) + pipeline.consumer_wait(smem_pipe_read); + + int read_stage = smem_pipe_read.index(); + warpgroup_arrive(); + cute::gemm(tiled_mma, 
tCrA(_,_,_,read_stage), tCrB(_,_,_,read_stage), accum); // (V,M,K) x (V,N,K) => (V,M,N) + warpgroup_commit_batch(); + + ++smem_pipe_read; + } + + warpgroup_fence_operand(accum); + // Mainloop GMMAs + k_tile_count -= prologue_mma_count; + + CUTLASS_PRAGMA_NO_UNROLL + for ( ; k_tile_count > 0; --k_tile_count) + { + // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) + pipeline.consumer_wait(smem_pipe_read); + + // + // Compute on k_tile + // + + int read_stage = smem_pipe_read.index(); + warpgroup_fence_operand(accum); + warpgroup_arrive(); + cute::gemm(tiled_mma, tCrA(_,_,_,read_stage), tCrB(_,_,_,read_stage), accum); // (V,M,K) x (V,N,K) => (V,M,N) + warpgroup_commit_batch(); + + /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed + warpgroup_wait(); + warpgroup_fence_operand(accum); + + pipeline.consumer_release(smem_pipe_release); // UNLOCK smem_pipe_release, done _computing_ on it + + // Advance smem_pipe_read and smem_pipe_release + ++smem_pipe_read; + ++smem_pipe_release; + } + + // Wait on all GMMAs to complete + warpgroup_wait<0>(); + warpgroup_fence_operand(accum); + + for (int count = 0; count < prologue_mma_count; ++count) { + pipeline.consumer_release(smem_pipe_release); // UNLOCK smem_pipe_release, done _computing_ on it + ++smem_pipe_release; + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/device/gemm_universal_adapter.h b/include/cutlass/gemm/device/gemm_universal_adapter.h index 657488c564..66884fb26b 100644 --- a/include/cutlass/gemm/device/gemm_universal_adapter.h +++ b/include/cutlass/gemm/device/gemm_universal_adapter.h @@ -36,20 +36,369 @@ #pragma once +// common #include "cutlass/cutlass.h" +#include "cutlass/trace.h" +#include "cutlass/cluster_launch.hpp" +#include "cutlass/device_kernel.h" +#include "cutlass/gemm/gemm.h" + +// 2.x #include "cutlass/gemm/device/gemm_universal_base.h" #include "cutlass/gemm/kernel/gemm_transpose_operands.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +// 3.x +#include "cutlass/gemm/kernel/gemm_universal.hpp" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::device { //////////////////////////////////////////////////////////////////////////////// -namespace cutlass { -namespace gemm { -namespace device { +/*! + GemmUniversalAdapter is a stateful, reusable GEMM handle built around a kernel + of type cutlass::gemm::kernel::Gemm or cutlass::gemm::kernel::GemmUniversal. + + It manages the lifetime of the underlying `kernel::Params` struct, and exposes APIs + to create it from the host facing arguments. For power users, new static methods + are exposed in 3.x APIs that bypass the stateful methods or args->params lowering. + + It supports kernel types that implement both the 2.x and 3.0 APIs, + however, this is done by specializing the implementation of GemmUniversalAdapter + on the two kernel API types, and thus, GemmUniversalAdapter's behaviour might + differ between the two specializations. 
+/*!
+  GemmUniversalAdapter is a stateful, reusable GEMM handle built around a kernel
+  of type cutlass::gemm::kernel::Gemm or cutlass::gemm::kernel::GemmUniversal.
+
+  It manages the lifetime of the underlying `kernel::Params` struct, and exposes APIs
+  to create it from the host-facing arguments. For power users, new static methods
+  are exposed in the 3.x APIs that bypass the stateful methods and the args-to-params lowering.
+
+  It supports kernel types that implement both the 2.x and 3.0 APIs;
+  however, this is done by specializing the implementation of GemmUniversalAdapter
+  on the two kernel API types, and thus GemmUniversalAdapter's behaviour might
+  differ between the two specializations.
+*/
+template <class GemmKernel_, class Enable = void>
+class GemmUniversalAdapter;
+
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////// CUTLASS 3.x API /////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+
+template <class GemmKernel_>
+class GemmUniversalAdapter<
+  GemmKernel_,
+  std::enable_if_t<gemm::detail::IsCutlass3GemmKernel<GemmKernel_>::value>>
+{
+public:
+  using GemmKernel = GemmKernel_;
+  using TileShape = typename GemmKernel::TileShape;
+  using ElementA = typename GemmKernel::ElementA;
+  using ElementB = typename GemmKernel::ElementB;
+  using ElementC = typename GemmKernel::ElementC;
+  using ElementAccumulator = typename GemmKernel::TiledMma::ValTypeC;
+  using DispatchPolicy = typename GemmKernel::DispatchPolicy;
+  using CollectiveMainloop = typename GemmKernel::CollectiveMainloop;
+  using CollectiveEpilogue = typename GemmKernel::CollectiveEpilogue;
+
+  // Map back to 2.x type as best as possible
+  using LayoutA = gemm::detail::StrideToLayoutTagA_t<typename GemmKernel::StrideA>;
+  using LayoutB = gemm::detail::StrideToLayoutTagB_t<typename GemmKernel::StrideB>;
+  using LayoutC = gemm::detail::StrideToLayoutTagC_t<typename GemmKernel::StrideC>;
+  using LayoutD = gemm::detail::StrideToLayoutTagC_t<typename GemmKernel::StrideD>;
+
+  // NOTE: 3.0 kernels do not support complex transforms for now ...
+  static ComplexTransform const kTransformA = ComplexTransform::kNone;
+  static ComplexTransform const kTransformB = ComplexTransform::kNone;
+
+  // Legacy: Assume MultiplyAdd only since we do not use this tag type in 3.0
+  using MathOperator = cutlass::arch::OpMultiplyAdd;
+
+  // If our TiledMMA's instruction thread layout size is larger than 1, we know it's a tensorop!
+  using OperatorClass = std::conditional_t<
+    (cute::size(typename GemmKernel::TiledMma::AtomThrID{}) > 1),
+    cutlass::arch::OpClassTensorOp, cutlass::arch::OpClassSimt>;
+
+  using ArchTag = typename GemmKernel::ArchTag;
+
+  // NOTE: Assume identity swizzle for now
+  static_assert(std::is_void_v<typename GemmKernel::GridSwizzle>,
+    "CUTLASS 3.x kernel types do not support grid swizzle functors yet.");
+  using ThreadblockSwizzle = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>;
+
+  // Assume TiledMma's ShapeMNK is the same as 2.x's ThreadblockShape
+  using ThreadblockShape = cutlass::gemm::GemmShape<
+    cute::size<0>(TileShape{}),
+    cute::size<1>(TileShape{}),
+    cute::size<2>(TileShape{})>;
+
+  using ClusterShape = cutlass::gemm::GemmShape<
+    cute::size<0>(typename GemmKernel::DispatchPolicy::ClusterShape{}),
+    cute::size<1>(typename GemmKernel::DispatchPolicy::ClusterShape{}),
+    cute::size<2>(typename GemmKernel::DispatchPolicy::ClusterShape{})>;
+
+  // Instruction shape is easy too, since we get that directly from our TiledMma's atom shape
+  using InstructionShape = cutlass::gemm::GemmShape<
+    cute::size<0>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{}),
+    cute::size<1>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{}),
+    cute::size<2>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{})>;
+
+  // Legacy: provide a correct warp count, but no reliable warp shape
+  static int const kThreadCount = GemmKernel::MaxThreadsPerBlock;
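A small compile-time sketch of this 3.x-to-2.x shape mapping, assuming the CUTLASS/CuTe headers are on the include path; the 128x128x64 `TileShape` below is a hypothetical example, not a type taken from this file:

#include "cute/layout.hpp"
#include "cutlass/gemm/gemm.h"

// A 3.x tile shape is a cute::Shape of static integers...
using TileShape = cute::Shape<cute::_128, cute::_128, cute::_64>;

// ...and the adapter recovers the legacy 2.x GemmShape by reading its modes.
using ThreadblockShape = cutlass::gemm::GemmShape<
    cute::size<0>(TileShape{}),
    cute::size<1>(TileShape{}),
    cute::size<2>(TileShape{})>;

static_assert(ThreadblockShape::kM == 128 && ThreadblockShape::kN == 128 && ThreadblockShape::kK == 64,
              "Mode sizes of the cute Shape become the GemmShape extents");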
+
+  // Warp shape is not a primary API type in 3.x
+  // But we can best approximate it by inspecting the TiledMma::TiledShape_MNK
+  // For this, we make the assumption that we always have 4 warps along M, and rest along N, none along K
+  // We also always round up the warp count to 4 if the tiled mma is smaller than 128 threads
+  static constexpr int WarpsInMma = std::max(4, cute::size(typename GemmKernel::TiledMma{}) / 32);
+  static constexpr int WarpsInMmaM = 4;
+  static constexpr int WarpsInMmaN = cute::ceil_div(WarpsInMma, WarpsInMmaM);
+  using WarpCount = cutlass::gemm::GemmShape<WarpsInMmaM, WarpsInMmaN, 1>;
+  using WarpShape = cutlass::gemm::GemmShape<
+    cute::size<0>(typename CollectiveMainloop::TiledMma::TiledShape_MNK{}) / WarpsInMmaM,
+    cute::size<1>(typename CollectiveMainloop::TiledMma::TiledShape_MNK{}) / WarpsInMmaN,
+    cute::size<2>(typename CollectiveMainloop::TiledMma::TiledShape_MNK{})>;
+
+  static int constexpr kStages = CollectiveMainloop::DispatchPolicy::Stages;
+
+  // Inspect TiledCopy for A and B to compute the alignment size
+  static int constexpr kAlignmentA = gemm::detail::get_alignment_count_from_gmem_tiled_copy<
+    typename CollectiveMainloop::GmemTiledCopyA, ElementA>();
+  static int constexpr kAlignmentB = gemm::detail::get_alignment_count_from_gmem_tiled_copy<
+    typename CollectiveMainloop::GmemTiledCopyB, ElementB>();
+
+  // NOTE: 3.0 DefaultEpilogues don't support vectorized stores (yet)
+  static int constexpr kAlignmentC = 1;
+  static int constexpr kAlignmentD = 1;
+
+  using EpilogueOutputOp = typename CollectiveEpilogue::ThreadEpilogueOp;
+
+  // Split-K preserves splits that are 128b aligned
+  static int constexpr kSplitKAlignment = std::max(
+    128 / sizeof_bits<ElementA>::value, 128 / sizeof_bits<ElementB>::value);
+
+  /// Argument structure: User API
+  using Arguments = typename GemmKernel::Arguments;
+  /// Argument structure: Kernel API
+  using Params = typename GemmKernel::Params;
+
+private:
+
+  /// Kernel API parameters object
+  Params params_;
+
+public:
+
+  /// Determines whether the GEMM can execute the given problem.
+  static Status
+  can_implement(Arguments const& args) {
+    if (GemmKernel::can_implement(args)) {
+      return Status::kSuccess;
+    }
+    else {
+      return Status::kInvalid;
+    }
+  }
-/////////////////////////////////////////////////////////////////////////////////////////////////
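The 128-bit alignment arithmetic used for the TMA path above reduces to a one-liner; a minimal sketch, assuming only the cutlass numeric-type headers:

#include <cstdint>
#include "cutlass/numeric_types.h"

// TMA copies are 128-bit aligned and 128-bit granular, so the element-count
// alignment is simply 128 divided by the bit width of the element type.
template <class Element>
constexpr int tma_alignment_count = 128 / cutlass::sizeof_bits<Element>::value;

static_assert(tma_alignment_count<cutlass::half_t> == 8,  "16-bit elements: 8-element alignment");
static_assert(tma_alignment_count<float>           == 4,  "32-bit elements: 4-element alignment");
static_assert(tma_alignment_count<int8_t>          == 16, "8-bit elements: 16-element alignment");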
+  /// Gets the workspace size
+  static size_t
+  get_workspace_size(Arguments const& args) {
+    size_t workspace_bytes = 0;
+    if (args.mode == GemmUniversalMode::kGemmSplitKParallel) {
+      workspace_bytes += sizeof(int) * size_t(cute::size<0>(TileShape{})) * size_t(cute::size<1>(TileShape{}));
+    }
+
+    CUTLASS_TRACE_HOST("  workspace_bytes: " << workspace_bytes);
+
+    workspace_bytes += GemmKernel::get_workspace_size(args);
+    return workspace_bytes;
+  }
+
+  /// Computes the grid shape
+  static dim3
+  get_grid_shape(Arguments const& args) {
+    auto tmp_params = GemmKernel::to_underlying_arguments(args, nullptr);
+    return GemmKernel::get_grid_shape(tmp_params);
+  }
+
+  /// Computes the grid shape
+  static dim3
+  get_grid_shape(Params const& params) {
+    return GemmKernel::get_grid_shape(params);
+  }
+
+  /// Computes the maximum number of active blocks per multiprocessor
+  static int maximum_active_blocks(int /* smem_capacity */ = -1) {
+    CUTLASS_TRACE_HOST("GemmUniversal::maximum_active_blocks()");
+    int max_active_blocks = -1;
+    int smem_size = GemmKernel::SharedStorageSize;
+
+    // first, account for dynamic smem capacity if needed
+    cudaError_t result;
+    if (smem_size >= (48 << 10)) {
+      CUTLASS_TRACE_HOST("  Setting smem size to " << smem_size);
+      result = cudaFuncSetAttribute(
+          device_kernel<GemmKernel>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize,
+          smem_size);
+      if (cudaSuccess != result) {
+        result = cudaGetLastError(); // to clear the error bit
+        CUTLASS_TRACE_HOST(
+            "  cudaFuncSetAttribute() returned error: "
+            << cudaGetErrorString(result));
+        return -1;
+      }
+    }
+
+    // query occupancy after setting smem size
+    result = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks,
+        device_kernel<GemmKernel>,
+        GemmKernel::MaxThreadsPerBlock,
+        smem_size);
+
+    if (cudaSuccess != result) {
+      result = cudaGetLastError(); // to clear the error bit
+      CUTLASS_TRACE_HOST(
+          "  cudaOccupancyMaxActiveBlocksPerMultiprocessor() returned error: "
+          << cudaGetErrorString(result));
+      return -1;
+    }
+
+    CUTLASS_TRACE_HOST("  max_active_blocks: " << max_active_blocks);
+    return max_active_blocks;
+  }
+
+  /// Initializes GEMM state from arguments.
+  Status
+  initialize(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) {
+    CUTLASS_TRACE_HOST("GemmUniversal::initialize() - workspace "
+      << workspace << ", stream: " << (stream ? "non-null" : "null"));
+
+    size_t workspace_bytes = GemmKernel::get_workspace_size(args);
+    CUTLASS_TRACE_HOST("  workspace_bytes: " << workspace_bytes);
+
+    if (workspace_bytes) {
+      if (!workspace) {
+        CUTLASS_TRACE_HOST("  error: device workspace must not be null");
+        return Status::kErrorWorkspaceNull;
+      }
+
+      if (args.mode == GemmUniversalMode::kGemm) {
+        CUTLASS_TRACE_HOST("  clearing device workspace");
+        cudaError_t result = cudaMemsetAsync(workspace, 0, workspace_bytes, stream);
+        if (cudaSuccess != result) {
+          result = cudaGetLastError(); // to clear the error bit
+          CUTLASS_TRACE_HOST("  cudaMemsetAsync() returned error " << cudaGetErrorString(result));
+          return Status::kErrorInternal;
+        }
+      }
+    }
+
+    // Initialize the Params structure
+    params_ = GemmKernel::to_underlying_arguments(args, workspace);
+
+    // account for dynamic smem capacity if needed
+    int smem_size = GemmKernel::SharedStorageSize;
+    if (smem_size >= (48 << 10)) {
+      CUTLASS_TRACE_HOST("  Setting smem size to " << smem_size);
+      cudaError_t result = cudaFuncSetAttribute(
+          device_kernel<GemmKernel>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize,
+          smem_size);
+      if (cudaSuccess != result) {
+        result = cudaGetLastError(); // to clear the error bit
+        CUTLASS_TRACE_HOST("  cudaFuncSetAttribute() returned error: " << cudaGetErrorString(result));
+        return Status::kErrorInternal;
+      }
+    }
+    return Status::kSuccess;
+  }
+
+  /// Update API is preserved in 3.0, but does not guarantee a lightweight update of params.
+  Status
+  update(Arguments const& args, void* workspace = nullptr) {
+    CUTLASS_TRACE_HOST("GemmUniversal()::update() - workspace: " << workspace);
+
+    size_t workspace_bytes = get_workspace_size(args);
+    if (workspace_bytes > 0 && nullptr == workspace) {
+      return Status::kErrorWorkspaceNull;
+    }
+
+    params_ = GemmKernel::to_underlying_arguments(args, workspace);
+    return Status::kSuccess;
+  }
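The opt-in-then-query dance above (cudaFuncSetAttribute before cudaOccupancyMaxActiveBlocksPerMultiprocessor) is the standard pattern for kernels whose dynamic shared memory exceeds 48 KiB. A standalone CUDA sketch with a hypothetical toy kernel, not this adapter's kernel type:

#include <cstdio>
#include <cuda_runtime.h>

// Toy stand-in for device_kernel<GemmKernel>: any __global__ entry using dynamic smem.
__global__ void toy_kernel() { extern __shared__ char smem[]; (void)smem; }

int main() {
  int smem_size = 64 << 10;  // pretend SharedStorageSize is 64 KiB

  // Above 48 KiB, dynamic smem must be opted into before any query or launch.
  if (smem_size >= (48 << 10)) {
    if (cudaFuncSetAttribute(toy_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
                             smem_size) != cudaSuccess) {
      std::printf("opt-in failed: %s\n", cudaGetErrorString(cudaGetLastError()));
      return 1;
    }
  }

  int max_active_blocks = -1;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, toy_kernel,
                                                /*blockDim=*/256, smem_size);
  std::printf("max active blocks/SM: %d\n", max_active_blocks);
  return 0;
}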
+  /// Primary run() entry point API that is static allowing users to create and manage their own params.
+  /// Supplied params struct must be constructed by calling GemmKernel::to_underlying_arguments()
+  static Status
+  run(Params& params, cudaStream_t stream = nullptr) {
+    CUTLASS_TRACE_HOST("GemmUniversal::run()");
+    dim3 const block = GemmKernel::get_block_shape();
+    dim3 const grid = get_grid_shape(params);
+
+    // configure smem size and carveout
+    int smem_size = GemmKernel::SharedStorageSize;
+
+    Status launch_result;
+    // Use extended launch API only for mainloops that use it
+    if constexpr (GemmKernel::ArchTag::kMinComputeCapability >= 90) {
+      dim3 cluster(cute::size<0>(typename GemmKernel::DispatchPolicy::ClusterShape{}),
+                   cute::size<1>(typename GemmKernel::DispatchPolicy::ClusterShape{}),
+                   cute::size<2>(typename GemmKernel::DispatchPolicy::ClusterShape{}));
+      void const* kernel = (void const*) device_kernel<GemmKernel>;
+      void* kernel_params[] = {&params};
+      launch_result = ClusterLauncher::launch(grid, cluster, block, smem_size, stream, kernel, kernel_params);
+    }
+    else {
+      launch_result = Status::kSuccess;
+      device_kernel<GemmKernel><<<grid, block, smem_size, stream>>>(params);
+    }
+
+    cudaError_t result = cudaGetLastError();
+    if (cudaSuccess == result && Status::kSuccess == launch_result) {
+      return Status::kSuccess;
+    }
+    else {
+      CUTLASS_TRACE_HOST("  Kernel launch failed. Reason: " << result);
+      return Status::kErrorInternal;
+    }
+  }
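On the sm90 path, ClusterLauncher::launch wraps CUDA's extended launch API. A minimal standalone sketch of that same mechanism (requires CUDA 11.8+ to compile and an sm_90 part to actually run; the toy kernel and shapes are hypothetical):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void toy_cluster_kernel(int x) {
  if (threadIdx.x == 0 && blockIdx.x == 0) printf("x=%d\n", x);
}

int main() {
  cudaLaunchConfig_t config = {};
  config.gridDim  = dim3(4, 1, 1);   // must be a multiple of the cluster shape
  config.blockDim = dim3(128, 1, 1);
  config.dynamicSmemBytes = 0;

  // A 2x1x1 thread block cluster, the same attribute ClusterLauncher configures.
  cudaLaunchAttribute attr = {};
  attr.id = cudaLaunchAttributeClusterDimension;
  attr.val.clusterDim.x = 2;
  attr.val.clusterDim.y = 1;
  attr.val.clusterDim.z = 1;
  config.attrs = &attr;
  config.numAttrs = 1;

  cudaError_t result = cudaLaunchKernelEx(&config, toy_cluster_kernel, 42);
  std::printf("launch: %s\n", cudaGetErrorString(result));
  return 0;
}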
+
+  //
+  // Non-static launch overloads that first create and set the internal params struct of this kernel handle.
+  //
+
+  /// Launches the kernel after first constructing Params internal state from supplied arguments.
+  Status
+  run(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) {
+    Status status = initialize(args, workspace, stream);
+    if (Status::kSuccess == status) {
+      status = run(params_, stream);
+    }
+    return status;
+  }
+
+  /// Launches the kernel after first constructing Params internal state from supplied arguments.
+  Status
+  operator()(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) {
+    return run(args, workspace, stream);
+  }
+
+  /// Overload that allows a user to re-launch the same kernel without updating internal params struct.
+  Status
+  run(cudaStream_t stream = nullptr) {
+    return run(params_, stream);
+  }
+
+  /// Overload that allows a user to re-launch the same kernel without updating internal params struct.
+  Status
+  operator()(cudaStream_t stream = nullptr) {
+    return run(params_, stream);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////// CUTLASS 2.x API /////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 
 template <typename GemmKernel_>
-class GemmUniversalAdapter {
+class GemmUniversalAdapter<
+  GemmKernel_,
+  std::enable_if_t<not gemm::detail::IsCutlass3GemmKernel<GemmKernel_>::value>>
+{
 public:
 
   using GemmKernel = GemmKernel_;
@@ -193,10 +542,8 @@ class GemmUniversalAdapter {
   }
 };
 
-/////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 
-} // namespace device
-} // namespace gemm
-} // namespace cutlass
+} // namespace cutlass::gemm::device
 
-/////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/gemm/dispatch_policy.hpp b/include/cutlass/gemm/dispatch_policy.hpp
new file mode 100644
index 0000000000..a2cd9a1117
--- /dev/null
+++ b/include/cutlass/gemm/dispatch_policy.hpp
@@ -0,0 +1,144 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/arch/arch.h"
+
+#include "cute/layout.hpp"
+#include "cute/numeric/integral_constant.hpp"
+
+//////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm {
+using namespace cute;
+
+//////////////////////////////////////////////////////////////////////////////
+
+//
+// Policies for categorical dispatch of mainloop against kernel grid schedules
+//
+struct KernelMultistage { };
+struct KernelTma { };
+struct KernelTmaWarpSpecialized { };
+struct KernelTmaWarpSpecializedPersistent { };
+
+//
+// Collective Mainloop Policies
+//
+
+// 2 stage pipeline through 1 stage in smem, 1 in rmem, WITHOUT predicated gmem loads
+struct MainloopSm70TwoStageUnpredicated {
+  constexpr static int Stages = 2;
+  using ArchTag = arch::Sm70;
+  using Schedule = KernelMultistage;
+  using ClusterShape = Shape<_1,_1,_1>;
+};
+
+// 2 stage pipeline through 1 stage in smem, 1 in rmem, with predicated gmem loads
+struct MainloopSm70TwoStage {
+  constexpr static int Stages = 2;
+  using ArchTag = arch::Sm70;
+  using Schedule = KernelMultistage;
+  using ClusterShape = Shape<_1,_1,_1>;
+};
+
+// n-buffer in smem (cp.async), pipelined with registers, WITHOUT predicated gmem loads
+template <int Stages_>
+struct MainloopSm80CpAsyncUnpredicated {
+  constexpr static int Stages = Stages_;
+  using ArchTag = arch::Sm80;
+  using Schedule = KernelMultistage;
+  using ClusterShape = Shape<_1,_1,_1>;
+};
+
+// n-buffer in smem (cp.async), pipelined with registers, with predicated gmem loads
+template <int Stages_>
+struct MainloopSm80CpAsync {
+  constexpr static int Stages = Stages_;
+  using ArchTag = arch::Sm80;
+  using Schedule = KernelMultistage;
+  using ClusterShape = Shape<_1,_1,_1>;
+};
+
+// n-buffer in smem (cp.async), pipelined with Hopper GMMA, WITHOUT predicated gmem loads
+template<
+  int Stages_,
+  class ClusterShape_ = Shape<_1,_1,_1>
+>
+struct MainloopSm90CpAsyncGmmaUnpredicated {
+  constexpr static int Stages = Stages_;
+  using ClusterShape = ClusterShape_;
+  using ArchTag = arch::Sm90;
+  using Schedule = KernelMultistage;
+};
+
+// n-buffer in smem (cp.async), pipelined with Hopper GMMA, with predicated gmem loads
+template<
+  int Stages_,
+  class ClusterShape_ = Shape<_1,_1,_1>
+>
+struct MainloopSm90CpAsyncGmma {
+  constexpr static int Stages = Stages_;
+  using ClusterShape = ClusterShape_;
+  using ArchTag = arch::Sm90;
+  using Schedule = KernelMultistage;
+};
+
+// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, static schedule between TMA and GMMA
+template<
+  int Stages_,
+  class ClusterShape_ = Shape<_1,_1,_1>,
+  int PipelineAsyncMmaStages_ = 1
+>
+struct MainloopSm90TmaGmma {
+  constexpr static int Stages = Stages_;
+  using ClusterShape = ClusterShape_;
+  constexpr static int PipelineAsyncMmaStages = PipelineAsyncMmaStages_;
+  using ArchTag = arch::Sm90;
+  using Schedule = KernelTma;
+};
+
+// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp specialized dynamic schedule
+template<
+  int Stages_,
+  class ClusterShape_ = Shape<_1,_1,_1>,
+  class KernelSchedule = KernelTmaWarpSpecialized
+>
+struct MainloopSm90TmaGmmaWarpSpecialized {
+  constexpr static int Stages = Stages_;
+  using ClusterShape = ClusterShape_;
+  using ArchTag = arch::Sm90;
+  using Schedule = KernelSchedule;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm
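These policies carry no code; they are pure tag types that the GemmUniversal kernel specializations key off via SFINAE on `DispatchPolicy::Schedule`. A self-contained C++ sketch of that selection mechanism, with illustrative toy names rather than the CUTLASS types:

#include <type_traits>

// Toy schedule tags and policies mirroring the ones above.
struct ToyKernelMultistage { };
struct ToyKernelTma { };

template <int Stages_>
struct ToyPolicyMultistage { static constexpr int Stages = Stages_; using Schedule = ToyKernelMultistage; };

template <int Stages_>
struct ToyPolicyTma { static constexpr int Stages = Stages_; using Schedule = ToyKernelTma; };

// Primary template left undefined; a specialization is selected purely by the
// policy's Schedule tag, the same enable_if/is_base_of pattern the kernels use.
template <class Policy, class Enable = void>
struct ToyKernel;

template <class Policy>
struct ToyKernel<Policy,
    std::enable_if_t<std::is_base_of_v<ToyKernelMultistage, typename Policy::Schedule>>> {
  static constexpr char const* name() { return "multistage"; }
};

template <class Policy>
struct ToyKernel<Policy,
    std::enable_if_t<std::is_base_of_v<ToyKernelTma, typename Policy::Schedule>>> {
  static constexpr char const* name() { return "tma"; }
};

static_assert(ToyKernel<ToyPolicyMultistage<3>>::name()[0] == 'm');
static_assert(ToyKernel<ToyPolicyTma<4>>::name()[0] == 't');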
diff --git a/include/cutlass/gemm/gemm.h b/include/cutlass/gemm/gemm.h
index 96a08de3f4..4b76101b28 100644
--- a/include/cutlass/gemm/gemm.h
+++ b/include/cutlass/gemm/gemm.h
@@ -35,6 +35,9 @@
 
 #include "cutlass/cutlass.h"
 #include "cutlass/coord.h"
+#include "cutlass/layout/matrix.h"
+#include "cute/layout.hpp"
+#include "cute/arch/copy_sm90.hpp"
 
 namespace cutlass {
 namespace gemm {
@@ -420,6 +423,151 @@ enum class SharedMemoryClearOption {
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+// For each cutlass::layout, provides its corresponding cute stride types, 64b by default
+
+template <class LayoutTag>
+struct TagToStrideA {};
+
+// Maps to modes [M, K, L]
+template <>
+struct TagToStrideA<layout::RowMajor> {
+  using type = cute::Stride<int64_t, cute::Int<1>, int64_t>;
+  using tag = layout::RowMajor;
+};
+
+// Maps to modes [M, K, L]
+template <>
+struct TagToStrideA<layout::ColumnMajor> {
+  using type = cute::Stride<cute::Int<1>, int64_t, int64_t>;
+  using tag = layout::ColumnMajor;
+};
+
+template <class LayoutTag>
+struct TagToStrideB {};
+
+// Maps to modes [N, K, L]
+template <>
+struct TagToStrideB<layout::RowMajor> {
+  using type = cute::Stride<cute::Int<1>, int64_t, int64_t>;
+  using tag = layout::RowMajor;
+};
+
+// Maps to modes [N, K, L]
+template <>
+struct TagToStrideB<layout::ColumnMajor> {
+  using type = cute::Stride<int64_t, cute::Int<1>, int64_t>;
+  using tag = layout::ColumnMajor;
+};
+
+
+// Maps to modes [M, N, L]
+template <class LayoutTag>
+struct TagToStrideC : TagToStrideA<LayoutTag> { };
+
+// Convenience aliases
+template <class LayoutTag>
+using TagToStrideA_t = typename TagToStrideA<LayoutTag>::type;
+
+template <class LayoutTag>
+using TagToStrideB_t = typename TagToStrideB<LayoutTag>::type;
+
+template <class LayoutTag>
+using TagToStrideC_t = typename TagToStrideC<LayoutTag>::type;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// For 2.x compatibility APIs, provide stride->layout tag mappers
+
+namespace detail {
+
+// Note : This method can be used for deducing the Layout Tag of A, C, D Matrices
+template <class StrideAC>
+constexpr
+auto
+stride_to_layout_tag_A() {
+  // Account for stride types with and without batch mode and batch modes with static zero stride
+  if constexpr (cute::size<0>(StrideAC{}) == 1) { // M major
+    return layout::ColumnMajor{};
+  }
+  else { // K major
+    return layout::RowMajor{};
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class StrideB>
+constexpr
+auto
+stride_to_layout_tag_B() {
+  // Account for stride types with and without batch mode and batch modes with static zero stride
+  if constexpr (cute::size<0>(StrideB{}) == 1) { // N major
+    return layout::RowMajor{};
+  }
+  else { // K major
+    return layout::ColumnMajor{};
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Inspects a TiledCopy and returns its alignment in terms of element count
+template <class GmemTiledCopy, class Element>
+constexpr int
+get_alignment_count_from_gmem_tiled_copy() {
+  // For TMA tiled copies, we know the alignment has to be 128 bits
+  if constexpr (   std::is_base_of_v<cute::SM90_TMA_LOAD, GmemTiledCopy>
+                || std::is_base_of_v<cute::SM90_TMA_LOAD_MULTICAST, GmemTiledCopy>) {
+    return 128 / sizeof_bits<Element>::value;
+  }
+  else
+  {
+    // For non-TMA tiled copies, TiledCopy holds the alignment count directly in its TiledShape_MN
+    return GmemTiledCopy::NumValSrc;
+  }
+}
+
+// Utilities to map Stride back on to their corresponding layout tags
+template <class StrideA>
+struct StrideToLayoutTagA {
+  using type = decltype(detail::stride_to_layout_tag_A<StrideA>());
+};
+
+template <class StrideB>
+struct StrideToLayoutTagB {
+  using type = decltype(detail::stride_to_layout_tag_B<StrideB>());
+};
+
+// Maps to modes [M, N, L]
+template <class StrideC>
+struct StrideToLayoutTagC : StrideToLayoutTagA<StrideC> { };
+
+// Convenience aliases
+template <class StrideA>
+using StrideToLayoutTagA_t = typename StrideToLayoutTagA<StrideA>::type;
+
+template <class StrideB>
+using StrideToLayoutTagB_t = typename StrideToLayoutTagB<StrideB>::type;
+
+template <class StrideC>
+using StrideToLayoutTagC_t = typename StrideToLayoutTagC<StrideC>::type;
+
+///////////////////////////////////////////////////////////////////////////////
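A compile-time round-trip check of these mappings, as a sketch assuming the CUTLASS 3.0 headers above are on the include path:

#include <cstdint>
#include <type_traits>
#include "cutlass/gemm/gemm.h"
#include "cutlass/layout/matrix.h"

// Row-major A is K-major: the unit stride sits in the K mode (mode 1 of [M, K, L]).
using StrideA_RowMajor = cutlass::gemm::TagToStrideA_t<cutlass::layout::RowMajor>;
static_assert(std::is_same_v<StrideA_RowMajor,
                             cute::Stride<int64_t, cute::Int<1>, int64_t>>,
              "RowMajor A maps to a K-major stride");

// Mapping the stride back recovers the original layout tag.
static_assert(std::is_same_v<
    cutlass::gemm::detail::StrideToLayoutTagA_t<StrideA_RowMajor>,
    cutlass::layout::RowMajor>,
    "tag -> stride -> tag round-trips");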
+
+// The following two metafunctions are used to detect whether a `kernel::Gemm` or `kernel::GemmUniversal`
+// is implementing the CUTLASS 3.x API or not, by checking whether the problem shape type is aliased within or not.
+template <class GemmKernel, class = void>
+struct IsCutlass3GemmKernel : std::false_type { };
+
+template <class GemmKernel>
+struct IsCutlass3GemmKernel<GemmKernel, std::void_t<typename GemmKernel::ProblemShape>>
+  : std::true_type { };
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace detail
+
+///////////////////////////////////////////////////////////////////////////////
+
 } // namespace gemm
 } // namespace cutlass
diff --git a/include/cutlass/gemm/kernel/default_gemm.h b/include/cutlass/gemm/kernel/default_gemm.h
index f6a312367c..4432008e65 100644
--- a/include/cutlass/gemm/kernel/default_gemm.h
+++ b/include/cutlass/gemm/kernel/default_gemm.h
@@ -262,8 +262,8 @@ struct DefaultGemm {
-  static_assert(platform::is_same<LayoutC, layout::RowMajor>::value
-             || platform::is_same<LayoutC, layout::AffineRankN<2>>::value,
+  static_assert((platform::is_same<LayoutC, layout::RowMajor>::value
+             || platform::is_same<LayoutC, layout::AffineRankN<2>>::value),
                 "Epilogue in the kernel level must be row major");
 
   /// Define the threadblock-scoped matrix multiply-accumulate
@@ -714,8 +714,8 @@ struct DefaultGemm<
     PermuteDLayout,
     typename platform::enable_if< ! platform::is_same<PermuteDLayout, layout::NoPermute>::value >::type > {
 
-  static_assert(platform::is_same<LayoutC, layout::RowMajor>::value
-             || platform::is_same<LayoutC, layout::AffineRankN<2>>::value,
+  static_assert((platform::is_same<LayoutC, layout::RowMajor>::value
+             || platform::is_same<LayoutC, layout::AffineRankN<2>>::value),
                 "Epilogue in the kernel level must be row major");
 
   /// Define the threadblock-scoped matrix multiply-accumulate
@@ -841,8 +841,8 @@ struct DefaultGemm {
-  static_assert(platform::is_same<LayoutC, layout::RowMajor>::value
-             || platform::is_same<LayoutC, layout::AffineRankN<2>>::value,
+  static_assert((platform::is_same<LayoutC, layout::RowMajor>::value
+             || platform::is_same<LayoutC, layout::AffineRankN<2>>::value),
                 "Epilogue in the kernel level must be row major");
 
   /// Define the threadblock-scoped matrix multiply-accumulate
diff --git a/include/cutlass/gemm/kernel/gemm.h b/include/cutlass/gemm/kernel/gemm.h
index 1427acbb2b..b5064ec7cf 100644
--- a/include/cutlass/gemm/kernel/gemm.h
+++ b/include/cutlass/gemm/kernel/gemm.h
@@ -256,7 +256,7 @@ struct Gemm {
 
     // Broadcast the warp_id computed by lane 0 to ensure dependent code
     // is compiled as warp-uniform.
-    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
+    int warp_idx = canonical_warp_idx();
     int lane_idx = threadIdx.x % 32;
 
     //
diff --git a/include/cutlass/gemm/kernel/gemm_array.h b/include/cutlass/gemm/kernel/gemm_array.h
index 2e226a9748..1862e206fd 100644
--- a/include/cutlass/gemm/kernel/gemm_array.h
+++ b/include/cutlass/gemm/kernel/gemm_array.h
@@ -193,7 +193,7 @@ struct GemmArray {
 
     // Broadcast the warp_id computed by lane 0 to ensure dependent code
     // is compiled as warp-uniform.
-    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
+    int warp_idx = canonical_warp_idx();
     int lane_idx = threadIdx.x % 32;
 
diff --git a/include/cutlass/gemm/kernel/gemm_batched.h b/include/cutlass/gemm/kernel/gemm_batched.h
index 489a899937..464aeef51d 100644
--- a/include/cutlass/gemm/kernel/gemm_batched.h
+++ b/include/cutlass/gemm/kernel/gemm_batched.h
@@ -204,7 +204,7 @@ struct GemmBatched {
 
     // Broadcast the warp_id computed by lane 0 to ensure dependent code
     // is compiled as warp-uniform.
- int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int warp_idx = canonical_warp_idx(); int lane_idx = threadIdx.x % 32; diff --git a/include/cutlass/gemm/kernel/gemm_grouped.h b/include/cutlass/gemm/kernel/gemm_grouped.h index fd3d7f2f7f..84dc4aeec9 100644 --- a/include/cutlass/gemm/kernel/gemm_grouped.h +++ b/include/cutlass/gemm/kernel/gemm_grouped.h @@ -395,7 +395,7 @@ struct GemmGrouped { // Broadcast the warp_id computed by lane 0 to ensure dependent code // is compiled as warp-uniform. - int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int warp_idx = canonical_warp_idx(); int lane_idx = threadIdx.x % 32; diff --git a/include/cutlass/gemm/kernel/gemm_pipelined.h b/include/cutlass/gemm/kernel/gemm_pipelined.h index 93faa2cc15..df450d08c7 100644 --- a/include/cutlass/gemm/kernel/gemm_pipelined.h +++ b/include/cutlass/gemm/kernel/gemm_pipelined.h @@ -111,7 +111,7 @@ __global__ void GemmPipelined( tb_thread_id, tb_offset_B); - int warp_id = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int warp_id = canonical_warp_idx(); int lane_id = threadIdx.x % 32; // diff --git a/include/cutlass/gemm/kernel/gemm_planar_complex.h b/include/cutlass/gemm/kernel/gemm_planar_complex.h index a2c24b258d..7dbc5923f9 100644 --- a/include/cutlass/gemm/kernel/gemm_planar_complex.h +++ b/include/cutlass/gemm/kernel/gemm_planar_complex.h @@ -525,7 +525,7 @@ struct GemmPlanarComplex { // Broadcast the warp_id computed by lane 0 to ensure dependent code // is compiled as warp-uniform. - int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int warp_idx = canonical_warp_idx(); int lane_idx = threadIdx.x % 32; diff --git a/include/cutlass/gemm/kernel/gemm_planar_complex_array.h b/include/cutlass/gemm/kernel/gemm_planar_complex_array.h index b990d6c298..21b801149a 100644 --- a/include/cutlass/gemm/kernel/gemm_planar_complex_array.h +++ b/include/cutlass/gemm/kernel/gemm_planar_complex_array.h @@ -467,7 +467,7 @@ struct GemmPlanarComplexArray { // Broadcast the warp_id computed by lane 0 to ensure dependent code // is compiled as warp-uniform. - int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int warp_idx = canonical_warp_idx(); int lane_idx = threadIdx.x % 32; // diff --git a/include/cutlass/gemm/kernel/gemm_universal.h b/include/cutlass/gemm/kernel/gemm_universal.h index 7ddb76d57c..fc62c01bf3 100644 --- a/include/cutlass/gemm/kernel/gemm_universal.h +++ b/include/cutlass/gemm/kernel/gemm_universal.h @@ -42,6 +42,8 @@ #include "cutlass/matrix_coord.h" #include "cutlass/complex.h" #include "cutlass/semaphore.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" + #include "cutlass/layout/matrix.h" #include "cutlass/gemm/gemm.h" #include "cutlass/gemm/kernel/params_universal_base.h" @@ -61,7 +63,15 @@ template < typename Epilogue_, ///! Epilogue typename ThreadblockSwizzle_ ///! Threadblock swizzling function > -struct GemmUniversal { +class GemmUniversal< + Mma_, + Epilogue_, + ThreadblockSwizzle_, + void, + // 3.x kernels use the first template argument to define the ProblemShape tuple + // We use this invariant to SFINAE dispatch against either the 2.x API or the 3.x API + std::enable_if_t::value> +> { public: using Mma = Mma_; @@ -528,7 +538,7 @@ struct GemmUniversal { // Broadcast the warp_id computed by lane 0 to ensure dependent code // is compiled as warp-uniform. 
- int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int warp_idx = canonical_warp_idx(); int lane_idx = threadIdx.x % 32; diff --git a/include/cutlass/gemm/kernel/gemm_universal.hpp b/include/cutlass/gemm/kernel/gemm_universal.hpp new file mode 100644 index 0000000000..cdac6ca488 --- /dev/null +++ b/include/cutlass/gemm/kernel/gemm_universal.hpp @@ -0,0 +1,72 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::kernel { + +//////////////////////////////////////////////////////////////////////////////// + +/* + * Stateless universal device GEMM kernel type that treats GEMM as + * a composition of a collective mainloop and a collective epilogue. + * + * Supports both the 2.x and 3.x APIs based on whether the first type is + * a cute::tuple<> or not. + * 2.x API implementation: cutlass/gemm/kernel/gemm_universal.h + * 3.x API implementation: cutlass/gemm/kernel/gemm_*.hpp + * + * In the following declaration, the name preceding the 'Or' refers to + * 3.x API type argument order, and the name succeeding the 'Or' refers to + * 2.x API type argument order. Template arguments without two names + * belong to the 3.x API only. 
+**/ +template < + class ProblemShapeOrThreadblockMma_, // (m, n, k) or (m, n, k, l) + class CollectiveMainloopOrEpilogue_, + class CollectiveEpilogueOrThreadblockSwizzle_, + class GridSwizzle_ = void, + class Enable = void +> +class GemmUniversal; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::kernel + +//////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/gemm/kernel/sm70_gemm.hpp" +#include "cutlass/gemm/kernel/sm90_gemm_tma.hpp" +#include "cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp" +#include "cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_persistent.hpp" +//////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/kernel/gemm_with_fused_epilogue.h b/include/cutlass/gemm/kernel/gemm_with_fused_epilogue.h index 7ab9d13968..8f67bd4577 100644 --- a/include/cutlass/gemm/kernel/gemm_with_fused_epilogue.h +++ b/include/cutlass/gemm/kernel/gemm_with_fused_epilogue.h @@ -918,7 +918,7 @@ struct GemmWithFusedEpilogue { lda(lda), ldb(ldb), ldc(ldc), ldd(ldd), ldr(ldr), ldt(ldt) { CUTLASS_TRACE_HOST("GemmWithFusedEpilogue::Arguments::Arguments() - problem_size: " << problem_size); - CUTLASS_TRACE_HOST(" ptr_Reduction: " << (void *)this->ptr_Reduction); + CUTLASS_TRACE_HOST(" ptr_Vector: " << (void *)this->ptr_Vector); CUTLASS_TRACE_HOST(" ptr_Tensor: " << (void *)this->ptr_Tensor); CUTLASS_TRACE_HOST(" ldr: " << this->ldr); CUTLASS_TRACE_HOST(" ldt: " << this->ldt); @@ -1019,7 +1019,7 @@ struct GemmWithFusedEpilogue { batch_stride_Tensor(args.batch_stride_Tensor) { CUTLASS_TRACE_HOST("GemmWithFusedEpilogue::Params::Params() - problem_size: " << problem_size); - CUTLASS_TRACE_HOST(" ptr_Reduction: " << (void *)this->ptr_Reduction); + CUTLASS_TRACE_HOST(" ptr_Vector: " << (void *)this->ptr_Vector); CUTLASS_TRACE_HOST(" ptr_Tensor: " << (void *)this->ptr_Tensor); CUTLASS_TRACE_HOST(" ldr: " << this->ldr); CUTLASS_TRACE_HOST(" ldt: " << args.ldt); @@ -1222,7 +1222,7 @@ struct GemmWithFusedEpilogue { // Broadcast the warp_id computed by lane 0 to ensure dependent code // is compiled as warp-uniform. - int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int warp_idx = canonical_warp_idx(); int lane_idx = threadIdx.x % 32; diff --git a/include/cutlass/gemm/kernel/gemm_with_k_reduction.h b/include/cutlass/gemm/kernel/gemm_with_k_reduction.h index 5145fb5db9..8e00e184d5 100644 --- a/include/cutlass/gemm/kernel/gemm_with_k_reduction.h +++ b/include/cutlass/gemm/kernel/gemm_with_k_reduction.h @@ -505,7 +505,7 @@ struct GemmWithKReduction { // Broadcast the warp_id computed by lane 0 to ensure dependent code // is compiled as warp-uniform. 
- int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int warp_idx = canonical_warp_idx(); int lane_idx = threadIdx.x % 32; diff --git a/include/cutlass/gemm/kernel/params_universal_base.h b/include/cutlass/gemm/kernel/params_universal_base.h index 1e77ea9c99..453379d448 100644 --- a/include/cutlass/gemm/kernel/params_universal_base.h +++ b/include/cutlass/gemm/kernel/params_universal_base.h @@ -189,15 +189,16 @@ struct UniversalParamsBase void *workspace, cudaStream_t stream = nullptr) { + semaphore = static_cast(workspace); // Zero-initialize entire workspace - if (workspace) + if (semaphore) { size_t workspace_bytes = get_workspace_size(); CUTLASS_TRACE_HOST(" Initialize " << workspace_bytes << " workspace bytes"); cudaError_t result = cudaMemsetAsync( - workspace, + semaphore, 0, workspace_bytes, stream); @@ -208,7 +209,6 @@ struct UniversalParamsBase } } - semaphore = static_cast(workspace); return Status::kSuccess; } diff --git a/include/cutlass/gemm/kernel/rank_2k_grouped.h b/include/cutlass/gemm/kernel/rank_2k_grouped.h index b93ecb2df2..1c840e7aff 100644 --- a/include/cutlass/gemm/kernel/rank_2k_grouped.h +++ b/include/cutlass/gemm/kernel/rank_2k_grouped.h @@ -525,7 +525,7 @@ struct Rank2KGrouped { // Broadcast the warp_id computed by lane 0 to ensure dependent code // is compiled as warp-uniform. - int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int warp_idx = canonical_warp_idx(); int lane_idx = threadIdx.x % 32; diff --git a/include/cutlass/gemm/kernel/rank_2k_universal.h b/include/cutlass/gemm/kernel/rank_2k_universal.h index c1ae5d33bb..6d1f4ac2ff 100644 --- a/include/cutlass/gemm/kernel/rank_2k_universal.h +++ b/include/cutlass/gemm/kernel/rank_2k_universal.h @@ -450,7 +450,7 @@ struct Rank2KUniversal { // Broadcast the warp_id computed by lane 0 to ensure dependent code // is compiled as warp-uniform. - int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int warp_idx = canonical_warp_idx(); int lane_idx = threadIdx.x % 32; diff --git a/include/cutlass/gemm/kernel/rank_k_universal.h b/include/cutlass/gemm/kernel/rank_k_universal.h index 3eaf595bf4..b7d1ad1958 100644 --- a/include/cutlass/gemm/kernel/rank_k_universal.h +++ b/include/cutlass/gemm/kernel/rank_k_universal.h @@ -403,7 +403,7 @@ struct RankKUniversal { // Broadcast the warp_id computed by lane 0 to ensure dependent code // is compiled as warp-uniform. - int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int warp_idx = canonical_warp_idx(); int lane_idx = threadIdx.x % 32; diff --git a/include/cutlass/gemm/kernel/sm70_gemm.hpp b/include/cutlass/gemm/kernel/sm70_gemm.hpp new file mode 100644 index 0000000000..efe51e23c9 --- /dev/null +++ b/include/cutlass/gemm/kernel/sm70_gemm.hpp @@ -0,0 +1,252 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/kernel_hardware_info.hpp"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+
+#include "cute/tensor.hpp"
+
+namespace cutlass::gemm::kernel {
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+  class ProblemShape_,
+  class CollectiveMainloop_,
+  class CollectiveEpilogue_,
+  class GridSwizzle_
+>
+class GemmUniversal<
+  ProblemShape_,
+  CollectiveMainloop_,
+  CollectiveEpilogue_,
+  GridSwizzle_,
+  std::enable_if_t<std::is_base_of_v<KernelMultistage, typename CollectiveMainloop_::DispatchPolicy::Schedule>>>
+{
+public:
+  //
+  // Type Aliases
+  //
+  using ProblemShape = ProblemShape_;
+  using GridSwizzle = GridSwizzle_;
+  static_assert(rank(ProblemShape{}) == 3 or rank(ProblemShape{}) == 4,
+    "ProblemShape{} should be <M,N,K> or <M,N,K,L>");
+
+  // Mainloop derived types
+  using CollectiveMainloop = CollectiveMainloop_;
+  using TileShape = typename CollectiveMainloop::TileShape;
+  using TiledMma = typename CollectiveMainloop::TiledMma;
+  using ArchTag = typename CollectiveMainloop::ArchTag;
+  using ElementA = typename CollectiveMainloop::ElementA;
+  using StrideA = typename CollectiveMainloop::StrideA;
+  using ElementB = typename CollectiveMainloop::ElementB;
+  using StrideB = typename CollectiveMainloop::StrideB;
+  using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy;
+  using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator;
+  using MainloopParams = typename CollectiveMainloop::Params;
+
+  // Epilogue derived types
+  using CollectiveEpilogue = CollectiveEpilogue_;
+  using ElementC = typename CollectiveEpilogue::ElementC;
+  using StrideC = typename CollectiveEpilogue::StrideC;
+  using ElementD = typename CollectiveEpilogue::ElementD;
+  using StrideD = typename CollectiveEpilogue::StrideD;
+  using EpilogueParams = typename CollectiveEpilogue::Params;
+  static_assert(std::is_same_v<ElementAccumulator, typename CollectiveEpilogue::ElementAccumulator>,
+    "Mainloop and epilogue do not agree on accumulator value type.");
+
+  static constexpr int SharedStorageSize = cute::max(
+    sizeof(typename CollectiveMainloop::SharedStorage),
+    sizeof(typename CollectiveEpilogue::SharedStorage));
+
+  static constexpr uint32_t MaxThreadsPerBlock = cute::size(TiledMma{});
+  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
+
+  // Device side arguments
+  struct Arguments {
+    GemmUniversalMode mode{};
+    ProblemShape problem_shape{};
+    ElementA const* ptr_A = nullptr;
+    StrideA dA{};
+    ElementB const* ptr_B = nullptr;
+    StrideB
dB{}; + EpilogueParams epilogue_params{}; + KernelHardwareInfo hw_info; + }; + + // Kernel entry point API + struct Params { + GemmUniversalMode mode; + ProblemShape problem_shape; + MainloopParams mainloop; + EpilogueParams epilogue; + }; + + // + // Methods + // + + // Convert to underlying arguments. In this case, a simple copy for the aliased type. + static + Params + to_underlying_arguments(Arguments const& args, void* workspace) { + (void) workspace; + return { + args.mode, + args.problem_shape, + CollectiveMainloop::to_underlying_arguments(args, workspace), + CollectiveEpilogue::to_underlying_arguments(args, workspace) + }; + } + + static + bool + can_implement(Arguments const& args) { + return args.mode == GemmUniversalMode::kGemm or + (args.mode == GemmUniversalMode::kBatched && rank(ProblemShape{}) == 4); + } + + static + int + get_workspace_size(Arguments const& args) { + return 0; + } + + static constexpr + dim3 + get_grid_shape(Params const& params) { + int batch_count = 1; + if constexpr (rank(ProblemShape{}) == 4) { + batch_count = cute::size<3>(params.problem_shape); + } + + return dim3( + cute::size(cute::ceil_div(cute::shape<0>(params.problem_shape), cute::shape<0>(TileShape{}))), + cute::size(cute::ceil_div(cute::shape<1>(params.problem_shape), cute::shape<1>(TileShape{}))), + batch_count + ); + } + + static constexpr + dim3 + get_block_shape() { + return dim3(MaxThreadsPerBlock, 1, 1); + } + + CUTLASS_DEVICE + void + operator()(Params const& params, char* smem_buf) { + using namespace cute; + using X = Underscore; + + // Preconditions + CUTE_STATIC_ASSERT(is_static::value); + + // Separate out problem shape for convenience + // Optionally append _1s until problem shape is rank-4 in case its is only rank-3 (MNK) + auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{}); + auto M = get<0>(problem_shape_MNKL); + auto N = get<1>(problem_shape_MNKL); + auto K = get<2>(problem_shape_MNKL); + auto L = get<3>(problem_shape_MNKL); + + // Preconditions + static_assert(rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. 
If batch mode is not needed, set L stride to Int<0>."); + + // Get the appropriate blocks for this thread block -- potential for thread block locality + int thread_idx = int(threadIdx.x); + auto blk_shape = TileShape{}; // (BLK_M,BLK_N,BLK_K) + auto [m_coord, n_coord, l_coord] = blockIdx; + auto blk_coord_mnkl = make_coord(m_coord, n_coord, _, l_coord); // (m,n,k,l) + + // Represent the full tensors + Tensor mA_mkl = make_tensor(make_gmem_ptr(params.mainloop.ptr_A), make_shape(M,K,L), params.mainloop.dA); //(m,k,l) + Tensor mB_nkl = make_tensor(make_gmem_ptr(params.mainloop.ptr_B), make_shape(N,K,L), params.mainloop.dB); //(n,k,l) + + // Get batch slice + Tensor mA_mk = mA_mkl(_,_,l_coord); // (m,k) + Tensor mB_nk = mB_nkl(_,_,l_coord); // (n,k) + + // Slice to get the tiles this thread block is responsible for + Tensor gA = local_tile(mA_mk, blk_shape, take<0,3>(blk_coord_mnkl), Step<_1, X,_1>{}); // (BLK_M,BLK_K,k) + Tensor gB = local_tile(mB_nk, blk_shape, take<0,3>(blk_coord_mnkl), Step< X,_1,_1>{}); // (BLK_N,BLK_K,k) + + // Compute tile residues for predication + auto m_max_coord = M - size<0>(gA) * get<0>(blk_coord_mnkl); // M - BLK_M * m_coord + auto n_max_coord = N - size<0>(gB) * get<1>(blk_coord_mnkl); // N - BLK_N * n_coord + auto k_residue = K - size<1>(gA) * size<2>(gA); // K - BLK_K * k_coord_max + auto residue_mnk = make_tuple(m_max_coord, n_max_coord, k_residue); + + // Allocate the tiled_mma and the accumulators for the (M,N) blk_shape + TiledMma tiled_mma; + Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape)); // (MMA,MMA_M,MMA_N) + clear(accumulators); + + auto k_tile_iter = cute::make_coord_iterator(shape<2>(gA)); + int k_tile_count = size<2>(gA); + + // Perform the collective scoped MMA + CollectiveMainloop collective_mma; + collective_mma( + accumulators, + gA, + gB, + accumulators, + k_tile_iter, k_tile_count, + residue_mnk, + thread_idx, + smem_buf + ); + + // Epilogue and write to gD + CollectiveEpilogue epilogue{params.epilogue}; + epilogue( + problem_shape_MNKL, + blk_shape, + blk_coord_mnkl, + accumulators, + tiled_mma, + residue_mnk, + thread_idx, + smem_buf + ); + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::kernel diff --git a/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp b/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp new file mode 100644 index 0000000000..bd82ed111e --- /dev/null +++ b/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp @@ -0,0 +1,301 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/kernel_hardware_info.hpp"
+#include "cute/arch/cluster_sm90.hpp"
+#include "cutlass/arch/mma_sm90.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/kernel/sm90_tile_scheduler.hpp"
+
+#include "cute/tensor.hpp"
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::kernel {
+
+namespace detail {
+
+// IF_SWAP_AB<T>::value will be true only if:
+//   class T has member SwapAB and T::SwapAB is true
+template <class T, class = void>
+struct IF_SWAP_AB { static constexpr bool value = false; };
+
+template <class T>
+struct IF_SWAP_AB<T, std::void_t<decltype(T::SwapAB)>>
+{ static constexpr bool value = T::SwapAB; };
+
+} // namespace detail
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+  class ProblemShape_,
+  class CollectiveMainloop_,
+  class CollectiveEpilogue_,
+  class GridSwizzle_
+>
+class GemmUniversal<
+  ProblemShape_,
+  CollectiveMainloop_,
+  CollectiveEpilogue_,
+  GridSwizzle_,
+  std::enable_if_t<std::is_base_of_v<KernelTma, typename CollectiveMainloop_::DispatchPolicy::Schedule>>>
+{
+public:
+  //
+  // Type Aliases
+  //
+  using ProblemShape = ProblemShape_;
+  using GridSwizzle = GridSwizzle_;
+  static_assert(rank(ProblemShape{}) == 3 or rank(ProblemShape{}) == 4,
+    "ProblemShape{} should be <M,N,K> or <M,N,K,L>");
+
+  // Mainloop derived types
+  using CollectiveMainloop = CollectiveMainloop_;
+  using TileShape = typename CollectiveMainloop::TileShape;
+  using TiledMma = typename CollectiveMainloop::TiledMma;
+  using ArchTag = typename CollectiveMainloop::ArchTag;
+  using ElementA = typename CollectiveMainloop::ElementA;
+  using StrideA = typename CollectiveMainloop::StrideA;
+  using ElementB = typename CollectiveMainloop::ElementB;
+  using StrideB = typename CollectiveMainloop::StrideB;
+  using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy;
+  using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator;
+  using ClusterShape = typename DispatchPolicy::ClusterShape;
+  using MainloopParams = typename CollectiveMainloop::Params;
+  static_assert(ArchTag::kMinComputeCapability >= 90);
+
+  // Epilogue derived types
+  using CollectiveEpilogue = CollectiveEpilogue_;
+  using ElementC = typename CollectiveEpilogue::ElementC;
+  using StrideC = typename CollectiveEpilogue::StrideC;
+  using ElementD = typename CollectiveEpilogue::ElementD;
+  using StrideD = typename CollectiveEpilogue::StrideD;
+  using EpilogueParams = typename CollectiveEpilogue::Params;
+  static_assert(std::is_same_v<ElementAccumulator, typename CollectiveEpilogue::ElementAccumulator>,
+    "Mainloop and epilogue do not agree on accumulator value type.");
+
+  static constexpr int SharedStorageSize =
cute::max( + sizeof(typename CollectiveMainloop::SharedStorage), + sizeof(typename CollectiveEpilogue::SharedStorage)); + + static constexpr uint32_t MaxThreadsPerBlock = size(TiledMma{}); + static constexpr uint32_t MinBlocksPerMultiprocessor = 1; + + // Device side arguments + struct Arguments { + GemmUniversalMode mode{}; + ProblemShape problem_shape{}; + ElementA const* ptr_A = nullptr; + StrideA dA{}; + ElementB const* ptr_B = nullptr; + StrideB dB{}; + EpilogueParams epilogue_params{}; + KernelHardwareInfo hw_info; + }; + + // Kernel entry point API + struct Params { + GemmUniversalMode mode; + ProblemShape problem_shape; + MainloopParams mainloop; + EpilogueParams epilogue; + }; + + // + // Methods + // + + // Convert to underlying arguments. In this case, a simple copy for the aliased type. + static + Params + to_underlying_arguments(Arguments const& args, void* workspace) { + (void) workspace; + auto problem_shape = args.problem_shape; + if constexpr (detail::IF_SWAP_AB::value) { + // swap M/N + get<0>(problem_shape) = get<1>(args.problem_shape); + get<1>(problem_shape) = get<0>(args.problem_shape); + } + return { + args.mode, + problem_shape, + CollectiveMainloop::to_underlying_arguments(args, workspace), + CollectiveEpilogue::to_underlying_arguments(args, workspace) + }; + } + + CUTLASS_HOST_DEVICE static + bool + can_implement(Arguments const& args) { + return args.mode == GemmUniversalMode::kGemm or + (args.mode == GemmUniversalMode::kBatched && rank(ProblemShape{}) == 4); + } + + static + int + get_workspace_size(Arguments const& args) { + return 0; + } + + // Computes the kernel launch grid shape based on runtime parameters + static constexpr + dim3 + get_grid_shape(Params const& params) { + auto cluster_shape = ClusterShape{}; + auto tile_shape = TileShape{}; + auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{}); + return detail::PersistentTileSchedulerSm90::get_tiled_blk_shape_mnl( + problem_shape_MNKL, tile_shape, cluster_shape); + } + + static constexpr + dim3 + get_block_shape() { + return dim3(MaxThreadsPerBlock, 1, 1); + } + + CUTLASS_DEVICE + void + operator()(Params const& params, char* smem_buf) { + using namespace cute; + using X = Underscore; + + // Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a. + #if ! defined(__CUDA_ARCH_FEAT_SM90_ALL) + if constexpr(size<0>(typename TiledMma::AtomShape_MNK{}) == 64) { + printf("ERROR : Arch conditional MMA instruction used without targetting sm90a compute capability. Aborting.\n"); + return; + } + #endif + + // Preconditions + static_assert(rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. 
If batch mode is not needed, set L stride to Int<0>."); + + int thread_idx = int(threadIdx.x); + int warp_idx = canonical_warp_idx(); + int lane_predicate = cute::elect_one_sync(); + + // Issue Tma Descriptor Prefetch from a single thread + if ((warp_idx == 0) && lane_predicate) { + CollectiveMainloop::prefetch_tma_descriptors(params.mainloop); + } + + // Separate out problem shape for convenience + // Optionally append _1s until problem shape is rank-4 in case it is only rank-3 (MNK) + auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{}); + auto M = get<0>(problem_shape_MNKL); + auto N = get<1>(problem_shape_MNKL); + auto K = get<2>(problem_shape_MNKL); + auto L = get<3>(problem_shape_MNKL); + + // TMA requires special handling of strides to deal with coord codomain mapping + // Represent the full tensors -- get these from TMA + Tensor mA_mkl = params.mainloop.tma_load_a.get_tma_tensor(make_shape(M,K,L)); // (m,k,l) + Tensor mB_nkl = params.mainloop.tma_load_b.get_tma_tensor(make_shape(N,K,L)); // (n,k,l) + + // Get the appropriate blocks for this thread block -- potential for thread block locality + auto blk_shape = TileShape{}; // (BLK_M,BLK_N,BLK_K) + auto blk_coord = make_coord(_,_,_); // (m,n,k) -- defer the slice + + // Make tiled views + Tensor gA_mkl = local_tile(mA_mkl, blk_shape, blk_coord, Step<_1, X,_1>{}); // (BLK_M,BLK_K,m,k,l) + Tensor gB_nkl = local_tile(mB_nkl, blk_shape, blk_coord, Step< X,_1,_1>{}); // (BLK_N,BLK_K,n,k,l) + + // Compute m_coord, n_coord, and l_coord with their post-tiled shapes + auto m_coord = idx2crd(int(blockIdx.x), shape<2>(gA_mkl)); + auto n_coord = idx2crd(int(blockIdx.y), shape<2>(gB_nkl)); + auto l_coord = idx2crd(int(blockIdx.z), shape<4>(gB_nkl)); + auto output_tile_coord = make_coord(m_coord, n_coord, _, l_coord); + + // Slice with m_coord and n_coord + Tensor gA = gA_mkl(_,_,m_coord,_,l_coord); // (BLK_M,BLK_K,k) + Tensor gB = gB_nkl(_,_,n_coord,_,l_coord); // (BLK_N,BLK_K,k) + + // Allocate the tiled_mma and the accumulators for the (M,N) blk_shape + TiledMma tiled_mma; + Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape)); // (MMA,MMA_M,MMA_N) + + clear(accumulators); + + auto k_tile_iter = cute::make_coord_iterator(shape<2>(gA)); + auto k_tile_count = size<2>(gA); + + // Perform the collective scoped MMA + CollectiveMainloop collective_mma; + collective_mma( + gA, params.mainloop.tma_load_a, + gB, params.mainloop.tma_load_b, + accumulators, + k_tile_iter, k_tile_count, + thread_idx, + smem_buf, + params.mainloop + ); + + constexpr int BLK_M_RANK = rank<0>(blk_shape); + bool m_oob = int(blockIdx.x) >= size<2>(gA_mkl); + auto m_max_coord = unwrap(cute::transform(make_seq<BLK_M_RANK>{}, [&](auto i) { + return m_oob ? 0 : get<i>(M) - get<0,i>(blk_shape) * get<i>(m_coord); + })); + + constexpr int BLK_N_RANK = rank<1>(blk_shape); + bool n_oob = int(blockIdx.y) >= size<2>(gB_nkl); + auto n_max_coord = unwrap(cute::transform(make_seq<BLK_N_RANK>{}, [&](auto i) { + return n_oob ? 0 : get<i>(N) - get<1,i>(blk_shape) * get<i>(n_coord); + })); + auto residue_mnk = make_tuple(m_max_coord, n_max_coord, Int<0>{}); + + // Epilogue and write to gD + CollectiveEpilogue epilogue{params.epilogue}; + epilogue( + problem_shape_MNKL, + blk_shape, + output_tile_coord, + accumulators, + tiled_mma, + residue_mnk, + thread_idx, + smem_buf + ); + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::kernel diff --git a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp new file mode 100644 index 0000000000..9fc719e2dc --- /dev/null +++ b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp @@ -0,0 +1,351 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * + **************************************************************************************************/ +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/kernel_hardware_info.hpp" +#include "cute/arch/cluster_sm90.hpp" +#include "cutlass/arch/reg_reconfig.h" +#include "cutlass/arch/mma_sm90.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/kernel/sm90_tile_scheduler.hpp" +#include "cutlass/pipeline.hpp" +#include "cute/tensor.hpp" + +/////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::kernel { + +/////////////////////////////////////////////////////////////////////////////// + +template < + class ProblemShape_, + class CollectiveMainloop_, + class CollectiveEpilogue_, + class GridSwizzle_ +> +class GemmUniversal< + ProblemShape_, + CollectiveMainloop_, + CollectiveEpilogue_, + GridSwizzle_, + std::enable_if_t<std::is_base_of_v<KernelTmaWarpSpecialized, typename CollectiveMainloop_::DispatchPolicy::Schedule>>> +{ +public: + // + // Type Aliases + // + using ProblemShape = ProblemShape_; + using GridSwizzle = GridSwizzle_; + static_assert(rank(ProblemShape{}) == 3 or rank(ProblemShape{}) == 4, + "ProblemShape{} should be <M,N,K> or <M,N,K,L>"); + + // Mainloop derived types + using CollectiveMainloop = CollectiveMainloop_; + using TileShape = typename CollectiveMainloop::TileShape; + using TiledMma = typename CollectiveMainloop::TiledMma; + using ArchTag = typename CollectiveMainloop::ArchTag; + using ElementA = typename CollectiveMainloop::ElementA; + using StrideA = typename CollectiveMainloop::StrideA; + using ElementB = typename CollectiveMainloop::ElementB; + using StrideB = typename CollectiveMainloop::StrideB; + using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy; + using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator; + using ClusterShape = typename DispatchPolicy::ClusterShape; + using MainloopParams = typename CollectiveMainloop::Params; + static_assert(ArchTag::kMinComputeCapability >= 90); + + // Epilogue derived types + using CollectiveEpilogue = CollectiveEpilogue_; + using ElementC = typename CollectiveEpilogue::ElementC; + using StrideC = typename CollectiveEpilogue::StrideC; + using ElementD = typename CollectiveEpilogue::ElementD; + using StrideD = typename CollectiveEpilogue::StrideD; + using EpilogueParams = typename CollectiveEpilogue::Params; + static_assert(std::is_same_v<ElementAccumulator, typename CollectiveEpilogue::ElementAccumulator>, + "Mainloop and epilogue do not agree on accumulator value type."); + + static constexpr int SharedStorageSize = cute::max( + sizeof(typename CollectiveMainloop::SharedStorage), + sizeof(typename CollectiveEpilogue::SharedStorage)); + + static constexpr uint32_t NumDmaWarpGroups = 1; + static constexpr uint32_t NumMmaWarpGroups = 1; + static constexpr uint32_t MaxThreadsPerBlock = size(TiledMma{}) + (NumDmaWarpGroups * NumThreadsPerWarpGroup); + static constexpr uint32_t MinBlocksPerMultiprocessor = 1; + + // Device side arguments + struct Arguments { + GemmUniversalMode mode{}; + ProblemShape problem_shape{}; + ElementA const* ptr_A = nullptr; + StrideA dA{}; + ElementB const* ptr_B = nullptr; + StrideB dB{}; + EpilogueParams epilogue_params{}; + KernelHardwareInfo hw_info; + }; + + // Kernel entry point API + struct Params { + GemmUniversalMode mode; + ProblemShape problem_shape; + MainloopParams mainloop; + EpilogueParams epilogue; + }; + + // + // Methods + // + + // Convert to underlying arguments. In this case, a simple copy for the aliased type.
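+ // For orientation, the host-side flow is roughly the following (an illustrative sketch only, + // not code from this patch; `stream` and the argument values are assumed, and in practice + // GemmUniversalAdapter performs the equivalent steps): + // + // Arguments args{GemmUniversalMode::kGemm, problem_shape, ptr_A, dA, ptr_B, dB, epilogue_params, hw_info}; + // if (can_implement(args)) { + // Params params = to_underlying_arguments(args, /*workspace=*/nullptr); + // device_kernel<GemmUniversal><<<get_grid_shape(params), get_block_shape(), SharedStorageSize, stream>>>(params); + // }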
+ static + Params + to_underlying_arguments(Arguments const& args, void* workspace) { + (void) workspace; + auto problem_shape = args.problem_shape; + if constexpr (detail::IF_SWAP_AB<CollectiveEpilogue>::value) { + // swap M/N + get<0>(problem_shape) = get<1>(args.problem_shape); + get<1>(problem_shape) = get<0>(args.problem_shape); + } + return { + args.mode, + problem_shape, + CollectiveMainloop::to_underlying_arguments(args, workspace), + CollectiveEpilogue::to_underlying_arguments(args, workspace) + }; + } + + CUTLASS_HOST_DEVICE static + bool + can_implement(Arguments const& args) { + return args.mode == GemmUniversalMode::kGemm or + (args.mode == GemmUniversalMode::kBatched && rank(ProblemShape{}) == 4); + } + + static + int + get_workspace_size(Arguments const& args) { + return 0; + } + + // Computes the kernel launch grid shape based on runtime parameters + static constexpr + dim3 + get_grid_shape(Params const& params) { + auto cluster_shape = ClusterShape{}; + auto tile_shape = TileShape{}; + auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{}); + return detail::PersistentTileSchedulerSm90::get_tiled_blk_shape_mnl( + problem_shape_MNKL, tile_shape, cluster_shape); + } + + static constexpr + dim3 + get_block_shape() { + return dim3(MaxThreadsPerBlock, 1, 1); + } + + CUTLASS_DEVICE + void + operator()(Params const& params, char* smem_buf) { + using namespace cute; + using X = Underscore; + + // Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a. + #if ! defined(__CUDA_ARCH_FEAT_SM90_ALL) + if constexpr(size<0>(typename TiledMma::AtomShape_MNK{}) == 64) { + printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n"); + return; + } + #endif + + enum class WarpGroupRole { + Producer = 0, + Consumer = 1, + }; + + int thread_idx = int(threadIdx.x); + int warp_idx = canonical_warp_idx(); + int warp_group_thread_idx = thread_idx % NumThreadsPerWarpGroup; + auto warp_group_role = WarpGroupRole(canonical_warp_group_idx()); + int lane_predicate = cute::elect_one_sync(); + + // Issue Tma Descriptor Prefetch from a single thread + if ((warp_idx == 0) && lane_predicate) { + CollectiveMainloop::prefetch_tma_descriptors(params.mainloop); + } + + using Pipeline = typename CollectiveMainloop::MainloopPipeline; + + using PipelineParams = typename CollectiveMainloop::PipelineParams; + PipelineParams params_pipeline; + params_pipeline.transaction_bytes = CollectiveMainloop::TmaTransactionBytes; + if (warp_group_role == WarpGroupRole::Producer) { + params_pipeline.role = Pipeline::ThreadCategory::Producer; + } + else { + params_pipeline.role = Pipeline::ThreadCategory::Consumer; + } + params_pipeline.is_leader = warp_group_thread_idx == 0; + params_pipeline.num_consumers = NumThreadsPerWarpGroup; + + // Initialize pipeline and setup starting pipeline state for the collectives + Pipeline pipeline = CollectiveMainloop::make_pipeline(smem_buf, params_pipeline); + + auto cluster_wait_fn = [&] () { + // We need this to guarantee that the Pipeline init is visible + // To all producers and consumer thread blocks in the Cluster + if constexpr (size(ClusterShape{}) > 1) { + cute::cluster_arrive_relaxed(); + return [] () { cute::cluster_wait(); }; + } + else { + __syncthreads(); + return [] () {}; // do nothing + } + } (); + + // Preconditions + static_assert(rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L].
If batch mode is not needed, set L stride to Int<0>."); + static_assert(rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>."); + + // Separate out problem shape for convenience + // Optionally append _1s until problem shape is rank-4 in case it is only rank-3 (MNK) + auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{}); + auto M = get<0>(problem_shape_MNKL); + auto N = get<1>(problem_shape_MNKL); + auto K = get<2>(problem_shape_MNKL); + auto L = get<3>(problem_shape_MNKL); + + // TMA requires special handling of strides to deal with coord codomain mapping + // Represent the full tensors -- get these from TMA + Tensor mA_mkl = params.mainloop.tma_load_a.get_tma_tensor(make_shape(M,K,L)); // (m,k,l) + Tensor mB_nkl = params.mainloop.tma_load_b.get_tma_tensor(make_shape(N,K,L)); // (n,k,l) + + // Get the appropriate blocks for this thread block -- potential for thread block locality + auto blk_shape = TileShape{}; // (BLK_M,BLK_N,BLK_K) + auto blk_coord = make_coord(_,_,_); // (m,n,k) -- defer the slice + + // Make tiled views + Tensor gA_mkl = local_tile(mA_mkl, blk_shape, blk_coord, Step<_1, X,_1>{}); // (BLK_M,BLK_K,m,k,l) + Tensor gB_nkl = local_tile(mB_nkl, blk_shape, blk_coord, Step< X,_1,_1>{}); // (BLK_N,BLK_K,n,k,l) + + // Compute m_coord, n_coord, and l_coord with their post-tiled shapes + auto m_coord = idx2crd(int(blockIdx.x), shape<2>(gA_mkl)); + auto n_coord = idx2crd(int(blockIdx.y), shape<2>(gB_nkl)); + auto l_coord = idx2crd(int(blockIdx.z), shape<4>(gB_nkl)); + auto output_tile_coord = make_coord(m_coord, n_coord, _, l_coord); + + // Slice with m_coord and n_coord + Tensor gA = gA_mkl(_,_,m_coord,_,l_coord); // (BLK_M,BLK_K,k) + Tensor gB = gB_nkl(_,_,n_coord,_,l_coord); // (BLK_N,BLK_K,k) + + auto k_tile_iter = cute::make_coord_iterator(shape<2>(gA)); + auto k_tile_count = size<2>(gA); + + // Wait for all thread blocks in the Cluster + cluster_wait_fn(); + + // In a warp specialized kernel, CollectiveMainloop exposes data movement and compute operations separately + CollectiveMainloop collective_mainloop; + + if (warp_group_role == WarpGroupRole::Producer) { + // For the DMA (prologue) - we start with an opposite phase - since we skip all waits + // i.e., we know that the buffer is indeed empty + typename CollectiveMainloop::PipelineState smem_pipe_write = cutlass::make_producer_start_state<Pipeline>(); + collective_mainloop.dma( + pipeline, + smem_pipe_write, + gA, params.mainloop.tma_load_a, + gB, params.mainloop.tma_load_b, + k_tile_iter, k_tile_count, + thread_idx, + smem_buf + ); + // Update starting pipeline state for the next tile + smem_pipe_write.advance(k_tile_count); + // Make sure all Consumer Warp Groups have been waited upon + collective_mainloop.dma_epilogue(pipeline, smem_pipe_write); + } + else if (warp_group_role == WarpGroupRole::Consumer) { + typename CollectiveMainloop::PipelineState smem_pipe_read; + TiledMma tiled_mma; + Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape)); // (MMA,MMA_M,MMA_N) + clear(accumulators); + + collective_mainloop.mma( + pipeline, + smem_pipe_read, + accumulators, + k_tile_count, + thread_idx, + smem_buf, + params.mainloop + ); + + constexpr int BLK_M_RANK = rank<0>(blk_shape); + bool m_oob = int(blockIdx.x) >= size<2>(gA_mkl); + auto m_max_coord = unwrap(cute::transform(make_seq<BLK_M_RANK>{}, [&](auto i) { + return m_oob ? 0 : get<i>(M) - get<0,i>(blk_shape) * get<i>(m_coord); + })); + + constexpr int BLK_N_RANK = rank<1>(blk_shape); + bool n_oob = int(blockIdx.y) >= size<2>(gB_nkl); + auto n_max_coord = unwrap(cute::transform(make_seq<BLK_N_RANK>{}, [&](auto i) { + return n_oob ? 0 : get<i>(N) - get<1,i>(blk_shape) * get<i>(n_coord); + })); + auto residue_mnk = make_tuple(m_max_coord, n_max_coord, Int<0>{}); + + // Epilogue and write to gD + CollectiveEpilogue epilogue{params.epilogue}; + epilogue( + problem_shape_MNKL, + blk_shape, + output_tile_coord, + accumulators, + tiled_mma, + residue_mnk, + warp_group_thread_idx, + smem_buf + ); + } + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::kernel diff --git a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_persistent.hpp b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_persistent.hpp new file mode 100644 index 0000000000..498bfad436 --- /dev/null +++ b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_persistent.hpp @@ -0,0 +1,487 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * + **************************************************************************************************/ +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/kernel_hardware_info.hpp" +#include "cutlass/fast_math.h" +#include "cute/arch/cluster_sm90.hpp" +#include "cutlass/arch/reg_reconfig.h" +#include "cutlass/arch/mma_sm90.h" +#include "cutlass/pipeline.hpp" +#include "cutlass/trace.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/kernel/sm90_tile_scheduler.hpp" + +#include "cute/tensor.hpp" + +/////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::kernel { + +/////////////////////////////////////////////////////////////////////////////// + +template < + class ProblemShape_, + class CollectiveMainloop_, + class CollectiveEpilogue_, + class GridSwizzle_ +> +class GemmUniversal< + ProblemShape_, + CollectiveMainloop_, + CollectiveEpilogue_, + GridSwizzle_, + std::enable_if_t<std::is_base_of_v<KernelTmaWarpSpecializedPersistent, typename CollectiveMainloop_::DispatchPolicy::Schedule>>> +{ +public: + // + // Type Aliases + // + using ProblemShape = ProblemShape_; + using GridSwizzle = GridSwizzle_; + static_assert(rank(ProblemShape{}) == 3 or rank(ProblemShape{}) == 4, + "ProblemShape{} should be <M,N,K> or <M,N,K,L>"); + + // Mainloop derived types + using CollectiveMainloop = CollectiveMainloop_; + using TileShape = typename CollectiveMainloop::TileShape; + using TiledMma = typename CollectiveMainloop::TiledMma; + using ArchTag = typename CollectiveMainloop::ArchTag; + using ElementA = typename CollectiveMainloop::ElementA; + using StrideA = typename CollectiveMainloop::StrideA; + using ElementB = typename CollectiveMainloop::ElementB; + using StrideB = typename CollectiveMainloop::StrideB; + using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy; + using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator; + using ClusterShape = typename DispatchPolicy::ClusterShape; + using MainloopParams = typename CollectiveMainloop::Params; + static_assert(ArchTag::kMinComputeCapability >= 90); + + // Epilogue derived types + using CollectiveEpilogue = CollectiveEpilogue_; + using ElementC = typename CollectiveEpilogue::ElementC; + using StrideC = typename CollectiveEpilogue::StrideC; + using ElementD = typename CollectiveEpilogue::ElementD; + using StrideD = typename CollectiveEpilogue::StrideD; + using EpilogueParams = typename CollectiveEpilogue::Params; + static_assert(std::is_same_v<ElementAccumulator, typename CollectiveEpilogue::ElementAccumulator>, + "Mainloop and epilogue do not agree on accumulator value type."); + + static constexpr uint32_t NumDmaWarpGroups = 1; + static constexpr uint32_t NumMmaWarpGroups = 2; + static constexpr uint32_t MaxThreadsPerBlock = size(TiledMma{}) + (NumMmaWarpGroups * NumThreadsPerWarpGroup); + static constexpr uint32_t MinBlocksPerMultiprocessor = 1; + + /// Register requirement for DMA and MATH WGs + static constexpr uint32_t DmaRegisterRequirement = 40; + static constexpr uint32_t MmaRegisterRequirement = 232; + + /* Order Sequence barrier with two stages: one for Mainloop and one for Epilogue */ + static constexpr uint32_t StagesPerMathWarpGroup = 2; + using MathWarpGroupOrderBarrier = cutlass::OrderedSequenceBarrier< + StagesPerMathWarpGroup, NumMmaWarpGroups>; + + // Kernel level shared memory storage + struct SharedStorage { + using MainloopSharedStorage = typename CollectiveMainloop::SharedStorage; + using EpilogueSharedStorage = typename CollectiveEpilogue::SharedStorage; + using MathWarpGroupOrderBarrierStorage = typename MathWarpGroupOrderBarrier::SharedStorage; + + MainloopSharedStorage
mainloop; + EpilogueSharedStorage epilogue; + alignas(16) MathWarpGroupOrderBarrierStorage math_wg_order_barrier_storage; + }; + + static constexpr int SharedStorageSize = sizeof(SharedStorage); + + // Device side arguments + struct Arguments { + GemmUniversalMode mode{}; + ProblemShape problem_shape{}; + ElementA const* ptr_A = nullptr; + StrideA dA{}; + ElementB const* ptr_B = nullptr; + StrideB dB{}; + EpilogueParams epilogue_params{}; + KernelHardwareInfo hw_info; + }; + + // Kernel entry point API + struct Params { + GemmUniversalMode mode; + ProblemShape problem_shape; + MainloopParams mainloop; + EpilogueParams epilogue; + KernelHardwareInfo hw_info; + }; + + // + // Methods + // + + // Convert to underlying arguments. In this case, a simple copy for the aliased type. + static + Params + to_underlying_arguments(Arguments const& args, void* workspace) { + CUTLASS_TRACE_HOST("to_underlying_arguments():"); + + (void) workspace; + auto problem_shape = args.problem_shape; + if constexpr (detail::IF_SWAP_AB<CollectiveEpilogue>::value) { + // swap M/N + get<0>(problem_shape) = get<1>(args.problem_shape); + get<1>(problem_shape) = get<0>(args.problem_shape); + } + + // Get SM count if needed, otherwise use user supplied SM count + int sm_count = args.hw_info.sm_count; + if (sm_count <= 0) { + CUTLASS_TRACE_HOST(" WARNING: Arguments do not include a valid SM count.\n" + " For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count."); + sm_count = KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id); + } + + CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << sm_count); + return { + args.mode, + problem_shape, + CollectiveMainloop::to_underlying_arguments(args, workspace), + CollectiveEpilogue::to_underlying_arguments(args, workspace), + {args.hw_info.device_id, sm_count} + }; + } + + CUTLASS_HOST_DEVICE static + bool + can_implement(Arguments const& args) { + bool implementable = args.mode == GemmUniversalMode::kGemm or + (args.mode == GemmUniversalMode::kBatched && rank(ProblemShape{}) == 4); + + // Number of blocks per problem (without batch) must not exceed 2^31 for the persistent scheduler to calculate using FastDivmod + auto problem_shape_MNKL = append<4>(args.problem_shape, Int<1>{}); + auto [problem_blocks_m, problem_blocks_n, problem_blocks_l] = + detail::PersistentTileSchedulerSm90::get_tiled_blk_shape_mnl(problem_shape_MNKL, TileShape{}, ClusterShape{}); + uint64_t problem_blocks = problem_blocks_m * problem_blocks_n * problem_blocks_l; + implementable = implementable && (problem_blocks < (uint64_t(1) << 31)); + + return implementable; + } + + static + int + get_workspace_size(Arguments const& args) { + return 0; + } + + // Computes the kernel launch grid shape based on runtime parameters + static constexpr + dim3 + get_grid_shape(Params const& params) { + int sm_count = params.hw_info.sm_count; + CUTLASS_TRACE_HOST("get_grid_shape(): Persistent schedule grid plan using SM count = " << sm_count); + + // Compute the total number of output tiles our problem has + auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{}); + auto [problem_blocks_m, problem_blocks_n, problem_blocks_l] = + detail::PersistentTileSchedulerSm90::get_tiled_blk_shape_mnl(problem_shape_MNKL, TileShape{}, ClusterShape{}); + int problem_blocks_total = problem_blocks_m * problem_blocks_n * problem_blocks_l; + + // Given device SM count, set grid size s.t.
we do not launch more thread blocks than we can run concurrently + dim3 launch_grid(1, cute::size<1>(ClusterShape{}), 1); + + // The else path is generic, however, we can avoid some divs if we know Cluster size is 1 + if constexpr (size(ClusterShape{}) == 1) { + launch_grid.x = std::min(sm_count, problem_blocks_total); + } + else { + /* + * Optimal grid size calculation is based on + * GH100: 8 GPCs, 72 TPCs (9 TPCs/GPC), 2 SMs/TPC, 144 SMs per full GPU + * Hence, maximum SMs per GPC = 18 + */ + constexpr int max_sm_per_gpc = 18; + // Provided SM count could possibly be less than the assumed maximum SMs per GPC + int min_num_gpc = sm_count < max_sm_per_gpc ? 1 : sm_count / max_sm_per_gpc; + int max_blk_occupancy_per_gpc = max_sm_per_gpc - (max_sm_per_gpc % size(ClusterShape{})); + int blk_per_device = min_num_gpc * max_blk_occupancy_per_gpc; + + launch_grid.x = std::min( + blk_per_device / size<1>(ClusterShape{}), + problem_blocks_total / size<1>(ClusterShape{})); + } + + return launch_grid; + } + + static constexpr + dim3 + get_block_shape() { + return dim3(MaxThreadsPerBlock, 1, 1); + } + + CUTLASS_DEVICE + void + operator()(Params const& params, char* smem_buf) { + using namespace cute; + using X = Underscore; + + // Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a. + #if ! defined(__CUDA_ARCH_FEAT_SM90_ALL) + if constexpr(size<0>(typename TiledMma::AtomShape_MNK{}) == 64) { + printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n"); + return; + } + #endif + + // Preconditions + static_assert(rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L].
If batch mode is not needed, set L stride to Int<0>."); + + enum class WarpGroupRole { + Producer = 0, + Consumer0 = 1, + Consumer1 = 2 + }; + + // Kernel level shared memory storage + SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf); + + int thread_idx = int(threadIdx.x); + int warp_idx = canonical_warp_idx(); + int warp_group_thread_idx = thread_idx % NumThreadsPerWarpGroup; + auto warp_group_role = WarpGroupRole(canonical_warp_group_idx()); + int lane_predicate = cute::elect_one_sync(); + + // Issue Tma Descriptor Prefetch from a single thread + if ((warp_idx == 0) && lane_predicate) { + CollectiveMainloop::prefetch_tma_descriptors(params.mainloop); + } + + using Pipeline = typename CollectiveMainloop::MainloopPipeline; + using PipelineParams = typename CollectiveMainloop::PipelineParams; + PipelineParams params_pipeline; + params_pipeline.transaction_bytes = CollectiveMainloop::TmaTransactionBytes; + if (warp_group_role == WarpGroupRole::Producer) { + params_pipeline.role = Pipeline::ThreadCategory::Producer; + } + else { + params_pipeline.role = Pipeline::ThreadCategory::Consumer; + } + params_pipeline.is_leader = warp_group_thread_idx == 0; + params_pipeline.num_consumers = NumThreadsPerWarpGroup; + + // Initialize pipeline and setup starting pipeline state for the collectives + Pipeline pipeline = CollectiveMainloop::make_pipeline(smem_buf, params_pipeline); + typename CollectiveMainloop::PipelineState collective_start_state_pipe; + + typename MathWarpGroupOrderBarrier::Params params_math_wg_order_barrier; + // DMA WG will not participate in these Ordered Barrier syncs + params_math_wg_order_barrier.group_id = canonical_warp_group_idx() - static_cast<int>(WarpGroupRole::Consumer0); + params_math_wg_order_barrier.group_size = NumThreadsPerWarpGroup; // Number of threads / participants in a group + MathWarpGroupOrderBarrier math_wg_order_barrier(shared_storage.math_wg_order_barrier_storage, params_math_wg_order_barrier); + + auto cluster_wait_fn = [&] () { + // We need this to guarantee that the Pipeline init is visible + // To all producers and consumer thread blocks in the Cluster + if constexpr (size(ClusterShape{}) > 1) { + cute::cluster_arrive_relaxed(); + return [] () { cute::cluster_wait(); }; + } + else { + __syncthreads(); + return [] () {}; // do nothing + } + } (); + + // Separate out problem shape for convenience + // Optionally append _1s until problem shape is rank-4 in case it is only rank-3 (MNK) + auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{}); + auto M = get<0>(problem_shape_MNKL); + auto N = get<1>(problem_shape_MNKL); + auto K = get<2>(problem_shape_MNKL); + auto L = get<3>(problem_shape_MNKL); + + // TMA requires special handling of strides to deal with coord codomain mapping + // Represent the full tensors -- get these from TMA + Tensor mA_mkl = params.mainloop.tma_load_a.get_tma_tensor(make_shape(M,K,L)); // (m,k,l) + Tensor mB_nkl = params.mainloop.tma_load_b.get_tma_tensor(make_shape(N,K,L)); // (n,k,l) + + // Get the appropriate blocks for this thread block -- potential for thread block locality + auto blk_shape = TileShape{}; // (BLK_M,BLK_N,BLK_K) + auto blk_coord = make_coord(_,_,_); // (m,n,k) -- defer the slice + + // Slice to get the tiles this thread block is responsible for + Tensor gA_mkl = local_tile(mA_mkl, blk_shape, blk_coord, Step<_1, X,_1>{}); // (BLK_M,BLK_K,m,k,l) + Tensor gB_nkl = local_tile(mB_nkl, blk_shape, blk_coord, Step< X,_1,_1>{}); // (BLK_N,BLK_K,n,k,l) + + // Get iterations along k-dimension + auto
k_tile_count = size<3>(gA_mkl); + + detail::PersistentTileSchedulerSm90 scheduler(problem_shape_MNKL, blk_shape, ClusterShape{}); + + if (warp_group_role == WarpGroupRole::Consumer1) { + /* Advance 2nd Math WG to the next work tile for the startup */ + scheduler.advance_to_next_work(); + /* Advance 2nd Math WG pipeline state to the end of 1st Math WG */ + collective_start_state_pipe.advance(k_tile_count); + } + auto work_tile_info = scheduler.get_current_work(); + + // Perform the collective scoped MMA + CollectiveMainloop collective_mainloop; + + // Wait for all thread blocks in the Cluster + cluster_wait_fn(); + + if (warp_group_role == WarpGroupRole::Producer) { + cutlass::arch::warpgroup_reg_dealloc<DmaRegisterRequirement>(); + + // For the DMA (prologue) - we start with an opposite phase - since we skip all waits + // i.e., we know that the buffer is indeed empty + typename CollectiveMainloop::PipelineState smem_pipe_write = cutlass::make_producer_start_state<Pipeline>(); + while (work_tile_info.is_valid_tile) { + // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape + auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl)); + auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl)); + auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl)); + auto blk_coord = make_coord(m_coord, n_coord, _, l_coord); + + // Slice with our work tile coordinates to construct mainloop tensor views + Tensor gA = gA_mkl(_,_,m_coord,_,l_coord); // (BLK_M,BLK_K,k) + Tensor gB = gB_nkl(_,_,n_coord,_,l_coord); // (BLK_N,BLK_K,k) + + auto k_tile_iter = cute::make_coord_iterator(shape<2>(gA)); + + collective_mainloop.dma( + pipeline, + smem_pipe_write, + gA, params.mainloop.tma_load_a, + gB, params.mainloop.tma_load_b, + k_tile_iter, k_tile_count, + thread_idx, + reinterpret_cast<char*>(&shared_storage.mainloop) + ); + // Update starting pipeline state for the next tile + smem_pipe_write.advance(k_tile_count); + scheduler.advance_to_next_work(); + work_tile_info = scheduler.get_current_work(); + } // Scheduler work fetch loop + + // Make sure all Consumer Warp Groups have been waited upon + collective_mainloop.dma_epilogue(pipeline, smem_pipe_write); + } // Producer Warp Group End + + else if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) { + // Allocate the tiled_mma and the accumulators for the (M,N) blk_shape + cutlass::arch::warpgroup_reg_alloc<MmaRegisterRequirement>(); + + while (work_tile_info.is_valid_tile) { + // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape + auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl)); + auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl)); + auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl)); + auto blk_coord = make_coord(m_coord, n_coord, _, l_coord); + + // Slice with our work tile coordinates to construct mainloop tensor views + Tensor gA = gA_mkl(_,_,m_coord,_,l_coord); // (BLK_M,BLK_K,k) + Tensor gB = gB_nkl(_,_,n_coord,_,l_coord); // (BLK_N,BLK_K,k) + + auto k_tile_iter = cute::make_coord_iterator(shape<2>(gA)); + + TiledMma tiled_mma; + Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape)); // (MMA,MMA_M,MMA_N) + clear(accumulators); + + /* Order two Math WG's MMA one after the other, helps hide Epilogue */ + math_wg_order_barrier.wait(); + + collective_mainloop.mma( + pipeline, + collective_start_state_pipe, + accumulators, + k_tile_count, + thread_idx, + reinterpret_cast<char*>(&shared_storage.mainloop), + params.mainloop + ); + + /* Cue for next Math WG's MMA to
start */ + math_wg_order_barrier.arrive(); + + /* Order two Math WG's Epilogue one after the other */ + math_wg_order_barrier.wait(); + + constexpr int BLK_M_RANK = rank<0>(blk_shape); + bool m_oob = int(work_tile_info.M_idx) >= size<2>(gA_mkl); + auto m_max_coord = unwrap(cute::transform(make_seq<BLK_M_RANK>{}, [&](auto i) { + return m_oob ? 0 : get<i>(M) - get<0,i>(blk_shape) * get<i>(m_coord); + })); + + constexpr int BLK_N_RANK = rank<1>(blk_shape); + bool n_oob = int(work_tile_info.N_idx) >= size<2>(gB_nkl); + auto n_max_coord = unwrap(cute::transform(make_seq<BLK_N_RANK>{}, [&](auto i) { + return n_oob ? 0 : get<i>(N) - get<1,i>(blk_shape) * get<i>(n_coord); + })); + auto residue_mnk = make_tuple(m_max_coord, n_max_coord, Int<0>{}); + + // Epilogue and write to gD + CollectiveEpilogue epilogue{params.epilogue}; + epilogue( + problem_shape_MNKL, + blk_shape, + blk_coord, + accumulators, + tiled_mma, + residue_mnk, + warp_group_thread_idx, + reinterpret_cast<char*>(&shared_storage.epilogue) + ); + + /* Cue for next Math WG's Epilogue to start */ + math_wg_order_barrier.arrive(); + + // Update starting pipeline state for the next tile + collective_start_state_pipe.advance(k_tile_count * NumMmaWarpGroups); + + scheduler.advance_to_next_work(NumMmaWarpGroups); + work_tile_info = scheduler.get_current_work(); + } // Scheduler work fetch loop + } // Consumer Warp Groups End + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::kernel diff --git a/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp b/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp new file mode 100644 index 0000000000..496d5e0703 --- /dev/null +++ b/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp @@ -0,0 +1,133 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * + **************************************************************************************************/ +#pragma once + +#include "cutlass/fast_math.h" +#include "cute/layout.hpp" + +namespace cutlass::gemm::kernel::detail { + +/////////////////////////////////////////////////////////////////////////////// + +// Persistent Thread Block (TB) scheduler +class PersistentTileSchedulerSm90 { + // + // Data members + // + +private: + uint32_t blocks_per_problem_; + uint32_t current_work_linear_idx_; + uint32_t grid_blocks_total_; + + FastDivmod divmod_batch_; + FastDivmod divmod_grid_y_; + FastDivmod divmod_blk_m_; + + struct WorkTileInfo { + int32_t M_idx = 0; + int32_t N_idx = 0; + int32_t L_idx = 0; + uint32_t is_valid_tile = false; + }; + + // + // Methods + // + +public: + + template <class ProblemShapeMNKL, class TileShape, class ClusterShape> + CUTLASS_DEVICE + PersistentTileSchedulerSm90(ProblemShapeMNKL problem_shape_mnkl, TileShape tile_shape, ClusterShape cluster_shape) { + // We only need the tile and cluster shape during scheduler setup, so let FTAD do the magic + static_assert(is_static<TileShape>::value); + static_assert(is_static<ClusterShape>::value); + + // Round up to nearest multiple of cluster dim along each mode + auto [problem_blocks_m, problem_blocks_n, problem_blocks_l] = get_tiled_blk_shape_mnl( + problem_shape_mnkl, tile_shape, cluster_shape); + + blocks_per_problem_ = problem_blocks_m * problem_blocks_n * problem_blocks_l; + current_work_linear_idx_ = (int(blockIdx.x) * int(gridDim.y)) + int(blockIdx.y); + grid_blocks_total_ = int(gridDim.x) * int(gridDim.y); + + // Pre-compute our fast div/mods for rasterization so we don't have to pay for DIVs + divmod_batch_ = FastDivmod(problem_blocks_m * problem_blocks_n); + divmod_grid_y_ = FastDivmod(size<1>(cluster_shape)); + divmod_blk_m_ = FastDivmod(problem_blocks_m); + } + + CUTLASS_DEVICE + WorkTileInfo + get_current_work() const { + // Map worker's linear index into the CTA tiled problem shape to the corresponding MNL indices + int work_idx_l, remainder; + divmod_batch_(work_idx_l, remainder, current_work_linear_idx_); + + int blk_per_grid_dim, dontcare; + divmod_grid_y_(blk_per_grid_dim, dontcare, remainder); + + int block_idx_m, block_idx_n; + divmod_blk_m_(block_idx_n, block_idx_m, blk_per_grid_dim); + int work_idx_m = block_idx_m; + int work_idx_n = (block_idx_n * gridDim.y) + blockIdx.y; + + return {work_idx_m, work_idx_n, work_idx_l, current_work_linear_idx_ < blocks_per_problem_}; + } + + CUTLASS_DEVICE + void + advance_to_next_work(uint32_t advance_count = 1) { + current_work_linear_idx_ += grid_blocks_total_ * advance_count; + } + + // Given the inputs, computes the total number of output blocks this problem will compute over + // Note that this is only the logical size of our grid, not the physical grid we will actually launch.
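+ // Worked example (illustrative numbers only, not taken from this file): for a 300x300x64 + // problem with L = 1, a 128x128 CTA tile, and a 2x1 cluster, + // blk_m = ceil(300 / 128) = 3, rounded up to 4 (nearest multiple of cluster dim 2), + // blk_n = ceil(300 / 128) = 3, already a multiple of cluster dim 1, + // yielding a logical grid of (4, 3, 1), i.e. 12 output tiles.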
+ template <class ProblemShapeMNKL, class BlockShape, class ClusterShape> + CUTLASS_HOST_DEVICE constexpr static + dim3 + get_tiled_blk_shape_mnl(ProblemShapeMNKL problem_shape_mnkl, BlockShape blk_shape, ClusterShape cluster_shape) { + // Across M and N is our Cluster tile, so we must round up the blocks to the nearest whole number of Cluster tiles + auto blk_m = cute::size(cute::ceil_div(cute::shape<0>(problem_shape_mnkl), cute::shape<0>(blk_shape))); + auto blk_n = cute::size(cute::ceil_div(cute::shape<1>(problem_shape_mnkl), cute::shape<1>(blk_shape))); + + // Round up to nearest multiple of cluster dim along each mode + int problem_blocks_m = round_up(blk_m, cute::size<0>(cluster_shape)); + int problem_blocks_n = round_up(blk_n, cute::size<1>(cluster_shape)); + + // Cluster tile does not span the batch mode, so no extra rounding up required for it + int problem_blocks_l = int(cute::size<3>(problem_shape_mnkl)); + return {uint32_t(problem_blocks_m), uint32_t(problem_blocks_n), uint32_t(problem_blocks_l)}; + } +}; + +} // namespace cutlass::gemm::kernel::detail diff --git a/include/cutlass/gemm/kernel/sparse_gemm.h b/include/cutlass/gemm/kernel/sparse_gemm.h index f7b2678111..eba95aad4c 100644 --- a/include/cutlass/gemm/kernel/sparse_gemm.h +++ b/include/cutlass/gemm/kernel/sparse_gemm.h @@ -277,7 +277,7 @@ struct SparseGemm { // Broadcast the warp_id computed by lane 0 to ensure dependent code // is compiled as warp-uniform. - int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int warp_idx = canonical_warp_idx(); int lane_idx = threadIdx.x % 32; // diff --git a/include/cutlass/gemm/kernel/symm_universal.h b/include/cutlass/gemm/kernel/symm_universal.h index 4bab2cf939..47e7035abe 100755 --- a/include/cutlass/gemm/kernel/symm_universal.h +++ b/include/cutlass/gemm/kernel/symm_universal.h @@ -415,7 +415,7 @@ struct SymmUniversal { // Broadcast the warp_id computed by lane 0 to ensure dependent code // is compiled as warp-uniform. - int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int warp_idx = canonical_warp_idx(); int lane_idx = threadIdx.x % 32; diff --git a/include/cutlass/gemm/kernel/trmm_universal.h b/include/cutlass/gemm/kernel/trmm_universal.h index 69e5563de1..7ba223bbb4 100644 --- a/include/cutlass/gemm/kernel/trmm_universal.h +++ b/include/cutlass/gemm/kernel/trmm_universal.h @@ -380,7 +380,7 @@ struct TrmmUniversal { // Broadcast the warp_id computed by lane 0 to ensure dependent code // is compiled as warp-uniform.
- int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int warp_idx = canonical_warp_idx(); int lane_idx = threadIdx.x % 32; diff --git a/include/cutlass/gemm/warp/mma_with_reduction_tensor_op.h b/include/cutlass/gemm/warp/mma_with_reduction_tensor_op.h index 9d46a14153..995796796d 100644 --- a/include/cutlass/gemm/warp/mma_with_reduction_tensor_op.h +++ b/include/cutlass/gemm/warp/mma_with_reduction_tensor_op.h @@ -44,7 +44,7 @@ #include "cutlass/matrix_shape.h" #include "cutlass/arch/memory_sm75.h" -#include "cutlass/arch/mma_sm75.h" +#include "cutlass/arch/mma_sm75.h" #include "cutlass/arch/mma_sm80.h" #include "cutlass/gemm/gemm.h" @@ -120,9 +120,9 @@ class MmaWithReductionTensorOp { /// Underlying matrix multiply operator (concept: arch::Mma) using ArchMmaOperator = typename Policy::Operator; - /// Indicates math operator + /// Indicates math operator using MathOperator = typename ArchMmaOperator::Operator; - + /// Architecture tag from underlying instruction using ArchTag = typename ArchMmaOperator::ArchTag; @@ -223,9 +223,9 @@ class MmaWithReductionTensorOp { /// Performs a warp-level matrix multiply-accumulate operation CUTLASS_DEVICE void operator()( - FragmentC &D, - TransformedFragmentA const &A, - TransformedFragmentB const &B, + FragmentC &D, + TransformedFragmentA const &A, + TransformedFragmentB const &B, FragmentC const &C, FragmentReduction &gemm_k_reduction ) const { @@ -236,9 +236,9 @@ class MmaWithReductionTensorOp { D = C; - MmaOperandA const *ptr_A = reinterpret_cast<MmaOperandA const *>(&A); - MmaOperandB const *ptr_B = reinterpret_cast<MmaOperandB const *>(&B); - MmaOperandC *ptr_D = reinterpret_cast<MmaOperandC *>(&D); + [[maybe_unused]] MmaOperandA const *ptr_A = reinterpret_cast<MmaOperandA const *>(&A); + [[maybe_unused]] MmaOperandB const *ptr_B = reinterpret_cast<MmaOperandB const *>(&B); + [[maybe_unused]] MmaOperandC *ptr_D = reinterpret_cast<MmaOperandC *>(&D); #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800) assert(0); @@ -258,7 +258,7 @@ class MmaWithReductionTensorOp { ptr_D[m + n_serpentine * MmaIterations::kRow]); if (!kReduceKForA && m == 0) { - #if 0 + #if 0 gemm_k_reduction[n_serpentine] += float(B[n_serpentine * 4]); gemm_k_reduction[n_serpentine] += float(B[n_serpentine * 4 + 1]); gemm_k_reduction[n_serpentine] += float(B[n_serpentine * 4 + 2]); @@ -306,12 +306,12 @@ class MmaWithReductionTensorOp { } if (kReduceKForA && (n == 0)) { - #if 0 + #if 0 gemm_k_reduction[m * 2] += float(A[m * 8]); gemm_k_reduction[m * 2] += float(A[m * 8 + 1]); gemm_k_reduction[m * 2] += float(A[m * 8 + 4]); gemm_k_reduction[m * 2] += float(A[m * 8 + 5]); - + gemm_k_reduction[m * 2 + 1] += float(A[m * 8 + 2]); gemm_k_reduction[m * 2 + 1] += float(A[m * 8 + 3]); gemm_k_reduction[m * 2 + 1] += float(A[m * 8 + 6]); @@ -411,9 +411,9 @@ class MmaWithReductionTensorOp { Array<typename ArchMmaOperator::ElementB, FragmentB::kElements / 2> * ptr_dst_B = reinterpret_cast<Array<typename ArchMmaOperator::ElementB, FragmentB::kElements / 2> *>(&dst_B); - + dst_A = convert_A(A); - + ptr_dst_B[0] = convert_B(ptr_B[0]); ptr_dst_B[1] = convert_B(ptr_B[1]); @@ -429,9 +429,9 @@ class MmaWithReductionTensorOp { Array<typename ArchMmaOperator::ElementA, FragmentA::kElements / 2> * ptr_dst_A = reinterpret_cast<Array<typename ArchMmaOperator::ElementA, FragmentA::kElements / 2> *>(&dst_A); - + dst_B = convert_B(B); - + ptr_dst_A[0] = convert_A(ptr_A[0]); ptr_dst_A[1] = convert_A(ptr_A[1]); #else diff --git a/include/cutlass/kernel_hardware_info.hpp b/include/cutlass/kernel_hardware_info.hpp new file mode 100644 index 0000000000..3ae09324c5 --- /dev/null +++ b/include/cutlass/kernel_hardware_info.hpp @@ -0,0 +1,71 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cuda_runtime.h" + +#include "cutlass/trace.h" + +namespace cutlass { + +struct KernelHardwareInfo { + // + // Data members + // + int device_id = 0; + int sm_count = 0; + + // + // Methods + // + + static int + query_device_multiprocessor_count(int device_id = 0) { + cudaError_t result = cudaGetDevice(&device_id); + if (result != cudaSuccess) { + CUTLASS_TRACE_HOST( + " cudaGetDevice() returned error " + << cudaGetErrorString(result)); + return 0; + } + cudaDeviceProp properties; + result = cudaGetDeviceProperties(&properties, device_id); + if (result != cudaSuccess) { + CUTLASS_TRACE_HOST( + " cudaGetDeviceProperties() returned error " + << cudaGetErrorString(result)); + return 0; + } + return properties.multiProcessorCount; + } +}; + +} // namespace cutlass diff --git a/include/cutlass/layout/matrix.h b/include/cutlass/layout/matrix.h index 51100f40c0..fe7a848934 100644 --- a/include/cutlass/layout/matrix.h +++ b/include/cutlass/layout/matrix.h @@ -39,6 +39,8 @@ */ #pragma once +#include "cute/layout.hpp" + #include "cutlass/cutlass.h" #include "cutlass/fast_math.h" #include "cutlass/matrix_coord.h" @@ -143,6 +145,15 @@ class RowMajor { LongIndex capacity(MatrixCoord const &extent) const { return LongIndex(extent.row()) * LongIndex(stride_[0]); } + + CUTLASS_HOST_DEVICE + cute::Layout<cute::Shape<int32_t, int32_t>, cute::Stride<int64_t, cute::Int<1> > > + to_cute_layout(MatrixCoord const &extent) const { + return cute::Layout<cute::Shape<int32_t, int32_t>, cute::Stride<int64_t, cute::Int<1> > >{ + {extent[0], extent[1]}, + {stride(0), cute::Int<1>{}} + }; + } }; /// Mapping function for column-major matrices.
@@ -236,6 +247,15 @@ class ColumnMajor { LongIndex capacity(MatrixCoord const &extent) const { return LongIndex(extent.column()) * LongIndex(stride_[0]); } + + CUTLASS_HOST_DEVICE + cute::Layout<cute::Shape<int32_t, int32_t>, cute::Stride< cute::Int<1>, int64_t> > + to_cute_layout(MatrixCoord const &extent) const { + return cute::Layout<cute::Shape<int32_t, int32_t>, cute::Stride<cute::Int<1>, int64_t> >{ + {extent[0], extent[1]}, + {cute::Int<1>{}, stride(0)} + }; + } }; /// Mapping function for interleaved matrices. Matrix is structured diff --git a/include/cutlass/pipeline.hpp b/include/cutlass/pipeline.hpp new file mode 100644 index 0000000000..67538aea17 --- /dev/null +++ b/include/cutlass/pipeline.hpp @@ -0,0 +1,529 @@ +/*************************************************************************************************** + * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are not permit- + * ted. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cute/numeric/integral_constant.hpp" +#include "cute/arch/cluster_sm90.hpp" +#include "cutlass/arch/barrier.h" +#include "cutlass/gemm/dispatch_policy.hpp" + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +using namespace arch; +using namespace cute; + +// Circular Buffer Index + Associated Phase +// Assumes only one operation possible - i.e., ++ +template <int Stages_> +struct PipelineState { + + static constexpr uint32_t Stages = Stages_; + +private: + int index_ = 0; + uint32_t phase_ = 0; + +public: + CUTLASS_DEVICE + PipelineState(): index_{}, phase_{} {} + + CUTLASS_DEVICE + PipelineState(int index, uint32_t phase) + : index_(index) + , phase_(phase){} + + CUTLASS_DEVICE + int index() const { + return index_; + } + + CUTLASS_DEVICE + uint32_t phase() const { + return phase_; + } + + CUTLASS_DEVICE + void operator++() { + ++index_; + if (index_ == Stages) { + index_ = 0; + phase_ ^= 1; + } + } + + CUTLASS_DEVICE + PipelineState& operator=(const PipelineState& other) { + index_ = other.index(); + phase_ = other.phase(); + return *this; + } + + CUTLASS_DEVICE + PipelineState advance(uint32_t num_iterations) { + // Fewer iterations than stages that still cross over the stage boundary => flipped phase + if ((num_iterations < Stages) && (index_ + num_iterations) >= Stages ) { + phase_ ^= 1; + } + // If the iterations cross over the stage boundary an odd number of + // times => flipped phase + if ((num_iterations >= Stages) && (((index_ + num_iterations) / Stages) % 2) == 1) { + phase_ ^= 1; + } + index_ = (index_ + num_iterations) % Stages; + return *this; + } + + CUTLASS_DEVICE + static PipelineState
make_pipeline_state(PipelineState start_state, uint32_t num_iterations) { + return start_state.advance(num_iterations); + } +}; + +template <class Pipeline> +CUTLASS_DEVICE +PipelineState<Pipeline::Stages> make_producer_start_state() +{ + // Producer starts with an opposite phase as the buffers are initially empty + constexpr int InitialProducerStage = 0; + constexpr uint32_t InitialProducerPhase = 1; + return {InitialProducerStage, InitialProducerPhase}; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// +// TMA (producer) Async Pipeline class +// +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Assumptions : Constructor is Visible Cluster-wide (as it needs a Cluster-Sync) +// We have exactly one thread elected in the Producer as the "leader" +// Currently, it is optional to elect a leader for the Consumers +template <int Stages_, class ClusterShape_> +class PipelineTmaAsync { +public : + using ClusterShape = ClusterShape_; + using FullBarrier = ClusterTransactionBarrier; + using EmptyBarrier = ClusterBarrier; + using ValueType = FullBarrier::ValueType; + static constexpr uint32_t Stages = Stages_; + + struct SharedStorage { + FullBarrier full_barrier_[Stages]; + EmptyBarrier empty_barrier_[Stages]; + }; + + enum class ThreadCategory { + NonParticipant, + Producer, + Consumer, + ProducerConsumer + }; + + struct Params { + uint32_t transaction_bytes = 0; + ThreadCategory role = ThreadCategory::NonParticipant; + uint32_t is_leader = 0; + uint32_t num_consumers = 0; + }; + +private : + // + // Data Members + // + uint32_t dst_blockid_ = 0; + uint32_t is_signalling_thread_ = 0; + FullBarrier *full_barrier_ptr_ = nullptr; + EmptyBarrier *empty_barrier_ptr_ = nullptr; + Params params_; + + // + // Methods + // + +public: + // Constructor + CUTLASS_DEVICE + PipelineTmaAsync(SharedStorage& storage, Params params) + : params_(params) + , full_barrier_ptr_(&storage.full_barrier_[0]) + , empty_barrier_ptr_(&storage.empty_barrier_[0]) { + + int warp_idx = canonical_warp_idx(); + int lane_predicate = cute::elect_one_sync(); + auto cluster_shape = ClusterShape{}; + + if (warp_idx == 0 && lane_predicate == 1) { + // Barrier FULL init + for (int i = 0; i < Stages; ++i) { + full_barrier_ptr_[i].init(1); + } + + // Barrier EMPTY init + uint32_t const num_consumers = cute::size<0>(cluster_shape) + cute::size<1>(cluster_shape) - 1; + for (int i = 0; i < Stages; ++i) { + empty_barrier_ptr_[i].init(num_consumers); + } + } + + // Logic to optimally schedule Empty Arrives + // Goal : To divide SYNCS Empty Arrival duty equally amongst the Warp-Group (128 threads) + dim3 block_id = block_id_in_cluster(); + auto cluster_size = cute::size(cluster_shape); + static constexpr int MaxClusterSize = 16; + static_assert(cluster_size <= MaxClusterSize, "ERROR : Cluster size too large !"
); + + // STEP 1 : Use Cute Layout function to generate an optimal dst block-id (0-15) + if (params_.num_consumers == 128) { + int thread_idx = threadIdx.x % 128; + is_signalling_thread_ = (thread_idx % (128 / MaxClusterSize)) == 0; + auto layout = cute::composition(Swizzle<2,0,-2>{}, + Layout<Shape<_4,_4>,Stride<_4, _1>>{}); + uint32_t thread_row = warp_idx % 4; + uint32_t thread_col = (thread_idx / 8) % 4; + dst_blockid_ = layout(thread_row, thread_col); + } + else if (params_.num_consumers == 32){ + int thread_idx = threadIdx.x % 32; + is_signalling_thread_ = (thread_idx % (32 / MaxClusterSize)) == 0; + auto layout = Layout<Shape<_4,_4>,Stride<_4, _1>>{}; + uint32_t thread_row = thread_idx / 8; + uint32_t thread_col = (thread_idx % 8) / 2; + dst_blockid_ = layout(thread_row, thread_col); + } + else { + is_signalling_thread_ = 0; + } + + // STEP 2: Find if this dst block-id needs an arrival for this problem + is_signalling_thread_ &= dst_blockid_ < cluster_size; + is_signalling_thread_ &= is_same_row_or_col(dst_blockid_, block_id, cluster_shape); + + cutlass::arch::fence_barrier_init(); + } + + CUTLASS_DEVICE + void producer_acquire(uint32_t stage, uint32_t phase, uint32_t skip_wait = false) { + // 1. Wait for empty barrier to be ready + // 2. Set the transaction bytes set to occur on the Full barrier + uint32_t done = empty_barrier_ptr_[stage].test_wait(phase, (!skip_wait)); + if ((!done) && (!skip_wait)){ + empty_barrier_ptr_[stage].wait(phase); + } + + if (params_.is_leader) { + full_barrier_ptr_[stage].arrive_and_reset_bytes(params_.transaction_bytes); + } + + } + + CUTLASS_DEVICE + void producer_acquire(PipelineState<Stages> state) { + producer_acquire(state.index(), state.phase()); + } + + // NOP for TMA based mainloop + CUTLASS_DEVICE + void producer_commit(uint32_t stage, uint32_t bytes) { + // Below code is used only for unit-testing (in the absence of TMA commit) + #if CUTLASS_UNIT_TEST_PIPELINE + if (params_.is_leader) { + // STEP 1 : Commit to self + full_barrier_ptr_[stage].commit(bytes); + + // STEP 2 : Commit to other blocks in our cluster + auto cluster_shape = ClusterShape{}; + Layout block_layout_in_cluster = make_layout(cluster_shape); + dim3 local_block_id = cute::block_id_in_cluster(); + + CUTLASS_PRAGMA_UNROLL + for(int n = 0; n < size<1>(block_layout_in_cluster); ++n) { + uint32_t dst_block_id = block_layout_in_cluster(local_block_id.x,n,Int<0>{}); + full_barrier_ptr_[stage].commit(dst_block_id, bytes, n!=local_block_id.y); + } + + CUTLASS_PRAGMA_UNROLL + for(int m = 0; m < size<0>(block_layout_in_cluster); ++m) { + uint32_t dst_block_id = block_layout_in_cluster(m,local_block_id.y,Int<0>{}); + full_barrier_ptr_[stage].commit(dst_block_id, bytes, m!=local_block_id.x); + } + } + #endif + } + + CUTLASS_DEVICE + void producer_commit(PipelineState<Stages> state, uint32_t bytes) { + producer_commit(state.index(), bytes); + } + + + // Wait for producer to commit transactions (done by TMA) + CUTLASS_DEVICE + void consumer_wait(uint32_t stage, uint32_t phase) { + uint32_t done = full_barrier_ptr_[stage].test_wait(phase); + if (!done){ + full_barrier_ptr_[stage].wait(phase); + } + } + + CUTLASS_DEVICE + void consumer_wait(PipelineState<Stages> state) { + consumer_wait(state.index(), state.phase()); + } + + // Consumer signalling Producer of completion + // Ensures all blocks in the Same Row and Column get notified.
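+ // In a mainloop, waits and releases pair up per k-tile, e.g. (an illustrative sketch only; + // smem_pipe_read is an assumed PipelineState<Stages> variable): + // + // for (int k = 0; k < k_tile_count; ++k) { + // pipeline.consumer_wait(smem_pipe_read); // block until TMA has filled this stage + // /* ... consume the smem stage at smem_pipe_read.index() ... */ + // pipeline.consumer_release(smem_pipe_read); // signal the producer this stage is empty + // ++smem_pipe_read; // step to the next stage, flipping phase on wrap + // }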
+  CUTLASS_DEVICE
+  void consumer_release(uint32_t stage, uint32_t skip = false) {
+    empty_barrier_ptr_[stage].arrive(dst_blockid_, is_signalling_thread_ & (!skip));
+  }
+
+  CUTLASS_DEVICE
+  void consumer_release(PipelineState state) {
+    consumer_release(state.index());
+  }
+
+  CUTLASS_DEVICE
+  ValueType* producer_get_barrier(uint32_t stage) {
+    return reinterpret_cast<ValueType*>(&full_barrier_ptr_[stage]);
+  }
+
+  CUTLASS_DEVICE
+  bool is_same_row_or_col(int dst_block_id, dim3 block_id, ClusterShape cluster_shape) {
+    return ((dst_block_id % cute::size<0>(cluster_shape)) == block_id.x ||
+            (dst_block_id / cute::size<0>(cluster_shape)) == block_id.y);
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Simple producer-consumer async Pipeline class
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// *Count signifies the number of producers / consumers who will announce their completion
+
+template <uint32_t Stages_>
+class PipelineAsync {
+public :
+  using FullBarrier = ClusterBarrier;
+  using EmptyBarrier = ClusterBarrier;
+  using ProducerBarrierType = FullBarrier::ValueType;
+  static constexpr uint32_t Stages = Stages_;
+
+  struct SharedStorage {
+    FullBarrier full_barrier_[Stages];
+    EmptyBarrier empty_barrier_[Stages];
+  };
+
+  enum class ThreadCategory {
+    NonParticipant,
+    Producer,
+    Consumer,
+    ProducerConsumer
+  };
+
+  struct Params {
+    ThreadCategory role = ThreadCategory::NonParticipant;
+    uint32_t producer_arv_count = 1;
+    uint32_t consumer_arv_count = 1;
+    uint32_t dst_blockid = cute::block_rank_in_cluster();
+  };
+
+private:
+  //
+  // Data Members
+  //
+  Params params_;
+  FullBarrier *full_barrier_ptr_;
+  EmptyBarrier *empty_barrier_ptr_;
+
+public:
+
+  // Default assumption when only storage is passed is :
+  // => single producer, single consumer & they are in the same block (within the Cluster)
+  CUTLASS_DEVICE
+  PipelineAsync(SharedStorage& storage)
+    : PipelineAsync(storage, {}) {}
+
+  CUTLASS_DEVICE
+  PipelineAsync(
+    SharedStorage& storage,
+    Params const& params) :
+      params_(params),
+      full_barrier_ptr_(&storage.full_barrier_[0]),
+      empty_barrier_ptr_(&storage.empty_barrier_[0]) {
+
+    int warp_idx = canonical_warp_idx();
+    int lane_predicate = cute::elect_one_sync();
+
+    // Barrier FULL, EMPTY init
+    // Init is done only by thread 0 of the block
+    if (warp_idx == 0 && lane_predicate == 1) {
+      for (int i = 0; i < Stages; ++i) {
+        full_barrier_ptr_[i].init(params.producer_arv_count);
+        empty_barrier_ptr_[i].init(params.consumer_arv_count);
+      }
+    }
+
+    cutlass::arch::fence_barrier_init();
+  }
+
+  CUTLASS_DEVICE
+  void producer_acquire(uint32_t stage, uint32_t phase, uint32_t skip_wait = false) {
+    uint32_t done = empty_barrier_ptr_[stage].test_wait(phase, (!skip_wait));
+    if ((!done) && (!skip_wait)){
+      empty_barrier_ptr_[stage].wait(phase);
+    }
+  }
+
+  CUTLASS_DEVICE
+  void producer_acquire(PipelineState state) {
+    producer_acquire(state.index(), state.phase());
+  }
+
+  CUTLASS_DEVICE
+  void producer_commit(uint32_t stage) {
+    full_barrier_ptr_[stage].arrive();
+  }
+
+  CUTLASS_DEVICE
+  void producer_commit(PipelineState state) {
+    producer_commit(state.index());
+  }
+
+  CUTLASS_DEVICE
+  void consumer_wait(uint32_t stage, uint32_t phase) {
+    uint32_t done = full_barrier_ptr_[stage].test_wait(phase);
+    if (!done){
+      full_barrier_ptr_[stage].wait(phase);
+    }
+  }
+
+  CUTLASS_DEVICE
+  void consumer_wait(PipelineState state) {
+    consumer_wait(state.index(), state.phase());
+  }
+
+  CUTLASS_DEVICE
+  void consumer_release(uint32_t stage, uint32_t skip = false) {
+    empty_barrier_ptr_[stage].arrive(params_.dst_blockid, (not skip));
+  }
+
+  CUTLASS_DEVICE
+  void consumer_release(PipelineState state) {
+    consumer_release(state.index());
+  }
+
+  CUTLASS_DEVICE
+  ProducerBarrierType* get_producer_barrier(uint32_t stage) {
+    return reinterpret_cast<ProducerBarrierType*>(&full_barrier_ptr_[stage]);
+  }
+};
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Barrier to ensure an ordered sequence between
+// SequenceLength number of groups (each with group_size participants) executing SequenceDepth stages
+// i.e., for all i < j - only after id "i" arrives at a particular stage "m"
+// will the wait() for id "j" succeed for the same stage
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <int SequenceDepth, int SequenceLength>
+class OrderedSequenceBarrier {
+public :
+  using Barrier = ClusterBarrier;
+
+  struct SharedStorage {
+    Barrier barrier_[SequenceDepth][SequenceLength];
+  };
+
+  struct Params {
+    uint32_t group_id;
+    uint32_t group_size;
+  };
+
+private :
+  //
+  // Data Members
+  //
+
+  // In the future, this Params object can easily be replaced with a CG object
+  Params params_;
+  Barrier *barrier_ptr_;
+  PipelineState<SequenceDepth> stage_;
+
+  static constexpr int Depth = SequenceDepth;
+  static constexpr int Length = SequenceLength;
+
+public:
+  OrderedSequenceBarrier() = delete;
+  OrderedSequenceBarrier(const OrderedSequenceBarrier&) = delete;
+  OrderedSequenceBarrier(OrderedSequenceBarrier&&) = delete;
+  OrderedSequenceBarrier& operator=(const OrderedSequenceBarrier&) = delete;
+  OrderedSequenceBarrier& operator=(OrderedSequenceBarrier&&) = delete;
+  ~OrderedSequenceBarrier() = default;
+
+  CUTLASS_DEVICE
+  OrderedSequenceBarrier(SharedStorage& storage, Params const& params) :
+      params_(params),
+      barrier_ptr_(&storage.barrier_[0][0]),
+      // Group 0 - starts with an opposite phase
+      stage_({0, params.group_id == 0}) {
+
+    int warp_idx = canonical_warp_idx();
+    int lane_predicate = cute::elect_one_sync();
+
+    // Barrier FULL, EMPTY init
+    // Init is done only by the one elected thread of the block
+    if (warp_idx == 0 && lane_predicate == 1) {
+      for (int d = 0; d < Depth; ++d) {
+        for (int l = 0; l < Length; ++l) {
+          barrier_ptr_[d * Length + l].init(params.group_size);
+        }
+      }
+    }
+
+    cutlass::arch::fence_barrier_init();
+  }
+
+  // Wait on a stage to be unlocked
+  CUTLASS_DEVICE
+  void wait() {
+    get_barrier_for_current_stage(params_.group_id).wait(stage_.phase());
+  }
+
+  // Signal completion of Stage and move to the next stage
+  // (group_id) signals to (group_id+1)
+  CUTLASS_DEVICE
+  void arrive() {
+    int signalling_id = (params_.group_id + 1) % Length;
+    get_barrier_for_current_stage(signalling_id).arrive();
+    ++stage_;
+  }
+
+private:
+
+  CUTLASS_DEVICE
+  Barrier& get_barrier_for_current_stage(int group_id) {
+    return barrier_ptr_[stage_.index() * Length + group_id];
+  }
+};
+
+} // end namespace cutlass
diff --git a/include/cutlass/quaternion.h b/include/cutlass/quaternion.h
index d62d1c6274..1015be4bf0 100644
--- a/include/cutlass/quaternion.h
+++ b/include/cutlass/quaternion.h
@@ -745,7 +745,6 @@ struct multiply_add<Quaternion<T>, Quaternion<T>, Quaternion<T>> {
   }
 };
 
-
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 } // namespace cutlass
diff --git a/include/cutlass/transform/pitch_linear_thread_map.h b/include/cutlass/transform/pitch_linear_thread_map.h
index
8ed0538c44..c084dd4870 100644 --- a/include/cutlass/transform/pitch_linear_thread_map.h +++ b/include/cutlass/transform/pitch_linear_thread_map.h @@ -29,7 +29,7 @@ * **************************************************************************************************/ /*! \file - \brief Templates implementing how threads are mapped to a given tile. + \brief Templates implementing how threads are mapped to a given tile. */ @@ -163,9 +163,9 @@ struct PitchLinearTilePolicyStripminedThreadContiguous using Iterations = layout::PitchLinearShape< Shape::kContiguous / (kThreads * kElementsPerAccess), - Shape::kStrided>; + Shape::kStrided>; - using Delta = layout::PitchLinearShape<1, 1>; + using Delta = layout::PitchLinearShape<1, 1>; CUTLASS_HOST_DEVICE static TensorCoord initial_offset(int thread_id) @@ -183,7 +183,7 @@ struct PitchLinearTilePolicyStripminedThreadStrided { static_assert((Shape::kStrided % Threads == 0), "Strided shape must divide number of threads"); - + using TensorCoord = layout::PitchLinearCoord; static int const kThreads = Threads; @@ -191,16 +191,16 @@ struct PitchLinearTilePolicyStripminedThreadStrided using Iterations = layout::PitchLinearShape< Shape::kContiguous / kElementsPerAccess, - Shape::kStrided / kThreads>; + Shape::kStrided / kThreads>; - using Delta = layout::PitchLinearShape<1, 1>; + using Delta = layout::PitchLinearShape<1, 1>; using ShapeVec = Shape; CUTLASS_HOST_DEVICE static TensorCoord initial_offset(int thread_id) { - + return TensorCoord(0, thread_id * Iterations::kStrided); } }; @@ -334,7 +334,7 @@ struct PitchLinearWarpRakedThreadMap { }; // This is the offset of a thread within a threadblock tile (units of vectors) - layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec = + layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec = warp_footprint * warp_offset + thread_offset_in_warp; // This is the offset of a thread within a threadblock tile (units of elements) @@ -460,7 +460,7 @@ struct PitchLinearStridedWarpRakedThreadMap { }; // This is the offset of a thread within a threadblock tile (units of vectors) - layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec = + layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec = warp_footprint * warp_offset + thread_offset_in_warp; // This is the offset of a thread within a threadblock tile (units of elements) @@ -601,7 +601,7 @@ struct TransposePitchLinearThreadMapSimt { static_assert(kElementsPerAccess == 1 , "Simt transpose requires elements per access to be 1"); ///< Iterations along each dimension (concept: PitchLinearShape) - using Iterations = + using Iterations = layout::PitchLinearShape; @@ -615,7 +615,7 @@ struct TransposePitchLinearThreadMapSimt { ///< Delta betweeen accesses (units of elements, concept: PitchLinearShape) using Delta = - layout::PitchLinearShape; @@ -693,12 +693,12 @@ struct PitchLinearWarpStripedThreadMap { // Divide it into the number of warps, first partitioning the strided dimension then the // contiguous. - static int const kWarpsStrided = - (WarpAccessIterations::kStrided >= kWarpCount + static int const kWarpsStrided = + (WarpAccessIterations::kStrided >= kWarpCount ? kWarpCount : (kWarpCount / WarpAccessIterations::kStrided)); - static int const kWarpsContiguous = - (kWarpCount > WarpAccessIterations::kStrided ? + static int const kWarpsContiguous = + (kWarpCount > WarpAccessIterations::kStrided ? 
WarpAccessIterations::kContiguous / kWarpsStrided : 1); /// Arrangement of warps within a threadblock-scoped tile @@ -752,7 +752,7 @@ struct PitchLinearWarpStripedThreadMap { }; // This is the offset of a thread within a threadblock tile (units of vectors) - layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec = + layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec = warp_footprint * warp_offset + thread_offset_in_warp; // This is the offset of a thread within a threadblock tile (units of elements) @@ -776,7 +776,7 @@ struct PitchLinearWarpStripedThreadMap { template < typename Shape_, int Threads, - typename ThreadTileShape + typename ThreadTileShape > struct PitchLinear2DThreadTileStripminedThreadMap; @@ -888,7 +888,7 @@ struct TransposePitchLinearThreadMap2DThreadTile { static_assert(kElementsPerAccess > 1 , "Simt transpose requires elements per access to be 1"); ///< Iterations along each dimension (concept: PitchLinearShape) - using Iterations = + using Iterations = layout::PitchLinearShape; @@ -899,7 +899,7 @@ struct TransposePitchLinearThreadMap2DThreadTile { ///< Delta betweeen accesses (units of elements, concept: PitchLinearShape) using Delta = - layout::PitchLinearShape; diff --git a/include/cutlass/uint128.h b/include/cutlass/uint128.h index df65623c66..38d5b4d587 100644 --- a/include/cutlass/uint128.h +++ b/include/cutlass/uint128.h @@ -54,7 +54,7 @@ namespace cutlass { ///////////////////////////////////////////////////////////////////////////////////////////////// /// Optionally enable GCC's built-in type -#if (defined(__x86_64) || defined (__aarch64__)) && !defined(__CUDA_ARCH__) && defined(__GNUC__) +#if defined(__x86_64) && !defined(__CUDA_ARCH__) && defined(__GNUC__) #define CUTLASS_UINT128_NATIVE #elif defined(_MSC_VER) && defined(_M_AMD64) && !defined(__CUDA_ARCH__) #define CUTLASS_INT128_ARITHMETIC @@ -71,7 +71,7 @@ namespace cutlass { struct uint128_t { /// Size of one part of the uint's storage in bits - int const kPartSize = sizeof_bits::value; + static constexpr int kPartSize = sizeof_bits::value; struct hilo { uint64_t lo; @@ -158,7 +158,7 @@ struct uint128_t { /// Multiply by unsigned 64b integer yielding 128b integer CUTLASS_HOST_DEVICE uint128_t operator*(uint64_t const &rhs) const { - uint128_t y; + uint128_t y{}; #if defined(CUTLASS_UINT128_NATIVE) y.native = native * rhs; #elif defined(CUTLASS_INT128_ARITHMETIC) diff --git a/media/docs/code_organization.md b/media/docs/code_organization.md index 61ffbafe9c..53ffc84dfe 100644 --- a/media/docs/code_organization.md +++ b/media/docs/code_organization.md @@ -7,6 +7,7 @@ This document describes the layout of the CUTLASS repository. The main components are: * **CUTLASS Template Library** - CUDA Templates for Linear Algebra Subroutines and Solvers (header only) +* **CuTe Template Library** - CUTLASS's core vocabulary layout type and associated algebra (header only) * **CUTLASS Utilities** - Additional templates * **CUTLASS Instance Library** - instantiations of CUTLASS templates covering the design space * **CUTLASS Profiler** - CUTLASS Library, Profiler, and Utilities @@ -29,7 +30,6 @@ CUTLASS Templates are implemented by header files in the following directory str ``` include/ # Top-level include directory. Client applications should target this path. - cutlass/ # CUDA Templates for Linear Algebra Subroutines and Solvers - headers only arch/ # direct exposure of architecture features (including instruction-level GEMMs) @@ -37,10 +37,11 @@ include/ # Top-level include directory. 
Client applications
    gemm/            # code specialized for general matrix product computations
      thread/        #   thread-level operators
      warp/          #   warp-level operators
+     collective/    #   3.x API operators built over all threads participating in a tiled mma/copy
      threadblock/   #   CTA-level operators
      kernel/        #   CUDA kernel entry points
      device/        #   launches kernel(s) over a full device
-     *              #   scope-agnostic components and basic vocabular type definitions for GEMM
+     *              #   scope-agnostic components and basic vocabulary type definitions for GEMM
    layout/          # layout definitions for matrices, tensors, and other mathematical objects in memory
      *
@@ -51,7 +52,7 @@ include/ # Top-level include directory. Client applications
      threadblock/   #   CTA-level operators
      kernel/        #   CUDA kernel entry points
      device/        #   launches kernel(s) over a full device
-     *              #   scope-agnostic components and basic vocabular type definitions
+     *              #   scope-agnostic components and basic vocabulary type definitions
    transform/       # code specialized for layout, type, and domain transformations
      thread/        #   thread-level operators
@@ -64,11 +65,27 @@ include/ # Top-level include directory. Client applications
    util/            # miscellaneous CUTLASS components
      *
    *                # core vocabulary types and fundamental arithmetic operators
+
+  cute/             # CuTe Layout, layout algebra, MMA/Copy atoms, tiled MMA/Copy
+    algorithm/      #   Definitions of core operations such as copy, gemm, and operations on cute::tuples
+    arch/           #   Bare bones PTX wrapper structs for copy and math instructions
+    atom/           #   Meta-information either linked to or built from arch/ operators
+      mma_atom.hpp  #     cute::Mma_Atom and cute::TiledMma
+      copy_atom.hpp #     cute::Copy_Atom and cute::TiledCopy
+      *sm*.hpp      #     Arch-specific meta-information for copy and math operations
+    container/      #   Core container types used across CuTe, namely, cute::tuple
+    numeric/        #   CuTe's internal numerics implementation
+    *               #   Core library types such as Shape, Stride, Layout, Tensor, and associated operations
```

See [Programming Guidelines](/media/docs/programming_guidelines.md) for further details about conventions and design patterns used throughout CUTLASS.

+## CuTe
+
+CuTe is a collection of C++ CUDA template abstractions for defining and operating on hierarchically multidimensional layouts of threads and data. CuTe provides `Layout` and `Tensor` objects that compactly package the type, shape, memory space, and layout of data, while performing the complicated indexing for the user. This lets programmers focus on the logical descriptions of their algorithms while CuTe does the mechanical bookkeeping for them. With these tools, we can quickly design, implement, and modify all dense linear algebra operations. More documentation
+for CuTe can be found in [`/media/docs/cute/`](/media/docs/cute/).
+
## Tools

The `tools/` directory contains clients of the CUTLASS Template library and includes the following.
@@ -181,9 +198,9 @@ examples/
   11_planar_complex_array/     # example demonstrating planar complex kernels with batch-specific problem sizes
 
-  12_gemm_bias_relu/           # example demonstrating GEMM fused with bias and relu
+  12_gemm_bias_relu/           # example demonstrating GEMM fused with bias and relu activation function
 
-  13_fused_two_gemms/          # example demonstrating two GEMms fused in one kernel
+  13_fused_two_gemms/          # example demonstrating two GEMMs fused into one kernel
 ```

## Media
diff --git a/media/docs/cute/00_quickstart.md b/media/docs/cute/00_quickstart.md
new file mode 100644
index 0000000000..df7ceadc7e
--- /dev/null
+++ b/media/docs/cute/00_quickstart.md
@@ -0,0 +1,75 @@
+# Getting Started With CuTe
+
+CuTe is a collection of C++ CUDA template abstractions for defining and operating on hierarchically multidimensional layouts of threads and data. CuTe provides `Layout` and `Tensor` objects that compactly package the type, shape, memory space, and layout of data, while performing the complicated indexing for the user. This lets programmers focus on the logical descriptions of their algorithms while CuTe does the mechanical bookkeeping for them. With these tools, we can quickly design, implement, and modify all dense linear algebra operations.
+
+The core abstractions of CuTe are the hierarchically multidimensional layouts, which can be composed with data arrays to represent tensors. The representation of layouts is powerful enough to represent nearly everything we need to implement efficient dense linear algebra. Layouts can also be combined and manipulated via functional composition, on which we build a large set of common operations such as tiling and partitioning.
+
+## System Requirements
+
+CuTe shares CUTLASS 3.0's software requirements,
+including NVCC with a C++17 host compiler.
+
+## Knowledge prerequisites
+
+CuTe is a CUDA C++ library. It requires C++17
+(the revision of the C++ Standard that was released in 2017).
+
+Throughout this tutorial, we assume intermediate C++ experience.
+For example, we assume that readers know
+how to read and write templated functions and classes, and
+how to use the `auto` keyword to deduce a function's return type.
+We will be gentle with C++ and explain some things
+that you might already know.
+
+We also assume intermediate CUDA experience.
+For example, readers must know
+the difference between device and host code,
+and how to launch kernels.
+
+## Building Tests and Examples
+
+CuTe's tests and examples build and run as part of CUTLASS's normal build process.
+CuTe's unit tests live in the [`test/unit/cute`](../../../test/unit/cute) subdirectory.
+Its examples live in the [`examples/cute`](../../../examples/cute) subdirectory.
+
+## Library Organization
+
+CuTe is a header-only C++ library, so there is no source code that needs building. Library headers are contained within the top level [`include/cute`](../../../include/cute) directory, with components of the library grouped by directories that represent their semantics.
+
+| Directory | Contents |
+|------------------------|------------------------|
+| [`include/cute`](../../../include/cute) | Each header in the top level corresponds to one of the fundamental building blocks of CuTe, such as [`Layout`](../../../include/cute/layout.hpp) or [`Tensor`](../../../include/cute/tensor.hpp). |
+| [`include/cute/container`](../../../include/cute/container) | Implementations of STL-like container objects, such as tuple, array, aligned array, and array views. |
+| [`include/cute/numeric`](../../../include/cute/numeric) | Templates for fundamental numeric data types: nonstandard floating-point types, unsigned integers, complex numbers, and integer sequences. |
+| [`include/cute/algorithm`](../../../include/cute/algorithm) | Implementations of utility algorithms such as copy, fill, and clear that automatically leverage architecture-specific features if available. |
+| [`include/cute/arch`](../../../include/cute/arch) | Wrappers for architecture-specific matrix-matrix multiply and copy instructions. |
+| [`include/cute/atom`](../../../include/cute/atom) | Meta-information for instructions in `arch` and utilities like partitioning and tiling. |
+
+## Tutorial
+
+This directory contains a CuTe tutorial in Markdown format.
+The file
+[`0x_gemm_tutorial.md`](./0x_gemm_tutorial.md)
+explains how to implement dense matrix-matrix multiply using CuTe components.
+It gives a broad overview of CuTe and thus would be a good place to start.
+
+Other files in this directory discuss specific parts of CuTe.
+
+* [`01_layout.md`](./01_layout.md) describes `Layout`, CuTe's core abstraction.
+
+* [`02_layout_operations.md`](./02_layout_operations.md) describes more advanced `Layout` operations and the CuTe layout algebra.
+
+* [`03_tensor.md`](./03_tensor.md) describes `Tensor`,
+  a multidimensional array abstraction which composes `Layout`
+  with an array of data.
+
+* [`04_algorithms.md`](./04_algorithms.md) summarizes CuTe's
+  generic algorithms that operate on `Tensor`s.
+
+* [`0t_mma_atom.md`](./0t_mma_atom.md) demonstrates CuTe's meta-information and interface to our GPUs'
+  architecture-specific Matrix Multiply-Accumulate (MMA) instructions.
+
+* [`0x_gemm_tutorial.md`](./0x_gemm_tutorial.md) provides a walkthrough of building a GEMM from scratch using CuTe.
+
+* [`0y_predication.md`](./0y_predication.md) explains what to do
+  if a tiling doesn't fit evenly into a matrix.
diff --git a/media/docs/cute/01_layout.md b/media/docs/cute/01_layout.md
new file mode 100644
index 0000000000..882d541ab3
--- /dev/null
+++ b/media/docs/cute/01_layout.md
@@ -0,0 +1,254 @@
+# CuTe Layouts
+
+## Layout
+
+This document describes `Layout`, CuTe's core abstraction.
+A `Layout` maps from (a) logical coordinate space(s)
+to a physical index space.
+
+`Layout`s present a common interface to multidimensional array access
+that abstracts away the details of how the array's elements are organized in memory.
+This lets users write algorithms that access multidimensional arrays generically,
+so that layouts can change, without users' code needing to change.
+
+CuTe also provides an "algebra of `Layout`s."
+`Layout`s can be combined and manipulated
+to construct more complicated layouts
+and to partition them across other layouts.
+This can help users do things like partition layouts of data over layouts of threads.
+
+## Layouts and Tensors
+
+Any of the `Layout`s discussed in this section can be composed with data -- a pointer or an array -- to create a `Tensor`. The `Layout`'s responsibility is to define the valid coordinate space(s), and therefore the logical shape of the data, and to map those coordinates into an index space: precisely the offsets used to index into the array of data.
+
+For details on `Tensor`, please refer to the
+[`Tensor` section of the tutorial](./03_tensor.md).
+
+## Shapes and Strides
+
+A `Layout` is a pair of `Shape` and `Stride`.
+Both `Shape` and `Stride` are `IntTuple` types.
+
+### IntTuple
+
+An `IntTuple` is an integer or a tuple of `IntTuple`s.
+This means that `IntTuple`s can be arbitrarily nested.
+Operations defined on `IntTuple`s include the following.
+
+* `get<I>(IntTuple)`: The `I`th element of the `IntTuple`. Note that `get<0>` is defined for integer `IntTuple`s.
+
+* `rank(IntTuple)`: The number of elements in an `IntTuple`. An int has rank 1, a tuple has rank `tuple_size`.
+
+* `depth(IntTuple)`: The number of hierarchical `IntTuple`s. An int has depth 0, a tuple has depth 1, a tuple that contains a tuple has depth 2, etc.
+
+* `size(IntTuple)`: The product of all elements of the `IntTuple`.
+
+We write `IntTuple`s with parentheses to denote the hierarchy. E.g. `6`, `(2)`, `(4,3)`, `(3,(6,2),8)` are all `IntTuple`s.
+
+## Layout
+
+A `Layout` is then a pair of `IntTuple`s. The first defines the abstract *shape* of the layout and the second defines the *strides*, which map from coordinates within the shape to the index space.
+
+As a pair of `IntTuple`s, we can define many similar operations on `Layout`s, including
+
+* `get<I>(Layout)`: The `I`th sub-layout of the `Layout`.
+
+* `rank(Layout)`: The number of modes in a `Layout`.
+
+* `depth(Layout)`: The number of hierarchical `Layout`s. An int has depth 0, a tuple has depth 1, a tuple that contains a tuple has depth 2, etc.
+
+* `shape(Layout)`: The shape of the `Layout`.
+
+* `stride(Layout)`: The stride of the `Layout`.
+
+* `size(Layout)`: The logical extent of the `Layout`. Equivalent to `size(shape(Layout))`.
+
+### Hierarchical access functions
+
+`IntTuple`s and thus `Layout`s can be arbitrarily nested.
+For convenience, we define versions of some of the above functions
+that take a sequence of integers, instead of just one integer.
+This makes it possible to access elements
+inside of nested `IntTuple` or `Layout`.
+For example, we permit `get<I...>(x)`, where `I...` here
+and throughout this section is a "C++ parameter pack"
+that denotes zero or more (integer) template arguments.
+That is, `get<I0,I1,...,IN>(x)` is equivalent to
+`get<IN>(` $\dots$ `(get<I1>(get<I0>(x)))` $\dots$ `)`,
+where the ellipses are pseudocode and not actual C++ syntax.
+These hierarchical access functions include the following.
+
+* `rank<I...>(x) := rank(get<I...>(x))`. The rank of the `I...`th element of `x`.
+
+* `depth<I...>(x) := depth(get<I...>(x))`. The depth of the `I...`th element of `x`.
+
+* `size<I...>(x) := size(get<I...>(x))`. The size of the `I...`th element of `x`.
+
+### Vector examples
+
+Then, we can define a vector as any `Shape` and `Stride` pair with `rank == 1`.
+For example, the `Layout`
+
+```
+Shape:  (8)
+Stride: (1)
+```
+
+defines a contiguous 8-element vector.
+Similarly, with a stride of `(2)`,
+the interpretation is that the eight elements
+are stored at positions 0, 2, 4, $\dots$.
+
+By the above definition, we *also* interpret
+
+```
+Shape:  ((4,2))
+Stride: ((1,4))
+```
+
+as a vector, since its shape is rank 1. The inner shape describes a 4x2 layout of data in column-major order, but the extra pair of parentheses suggests we can interpret those two modes as a single 1-D 8-element vector instead. Due to the strides, the elements are also contiguous.
+
+### Matrix examples
+
+Generalizing, we define a matrix as any `Shape` and `Stride` pair with rank 2. For example,
+
+```
+Shape:  (4,2)
+Stride: (1,4)
+  0   4
+  1   5
+  2   6
+  3   7
+```
+
+is a 4x2 column-major matrix, and
+
+```
+Shape:  (4,2)
+Stride: (2,1)
+  0   1
+  2   3
+  4   5
+  6   7
+```
+
+is a 4x2 row-major matrix.
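+
+For instance, using the `make_layout` constructors introduced in the next section, a minimal sketch of ours (output shown in comments, via CuTe's `print`) exercises these operations on the row-major example:
+
+```c++
+auto row_major = make_layout(make_shape (_4{}, _2{}),
+                             make_stride(_2{}, _1{}));
+print(rank(row_major));    // _2  (two modes)
+print(depth(row_major));   // _1  (a flat tuple of ints)
+print(size(row_major));    // _8  (4 * 2 elements)
+print(row_major(3, 1));    // 7 = 3*2 + 1*1
+```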
+
+Each of the modes of the matrix can also be split into *multi-indices* like the vector example.
+This lets us express more layouts beyond just row major and column major. For example,
+
+```
+Shape:  ((2,2),2)
+Stride: ((4,1),2)
+  0   2
+  4   6
+  1   3
+  5   7
+```
+
+is also logically 4x2, with a stride of 2 across the rows but a multi-stride down the columns.
+Since this layout is logically 4x2,
+like the column-major and row-major examples above,
+we can _still_ use 2-D coordinates to index into it.
+
+## Constructing a `Layout`
+
+A `Layout` can be constructed in many different ways.
+It can include any combination of compile-time (static) integers
+or run-time (dynamic) integers.
+
+```c++
+auto layout_8s = make_layout(Int<8>{});
+auto layout_8d = make_layout(8);
+
+auto layout_2sx4s = make_layout(make_shape(Int<2>{},Int<4>{}));
+auto layout_2sx4d = make_layout(make_shape(Int<2>{},4));
+
+auto layout_2x4 = make_layout(make_shape (2, make_shape (2,2)),
+                              make_stride(4, make_stride(2,1)));
+```
+
+## Using a `Layout`
+
+The fundamental use of a `Layout` is to map between logical coordinate space(s) and index space. For example, to print an arbitrary rank-2 layout, we can write the function
+
+```c++
+template <class Layout>
+void print2D(Layout const& layout)
+{
+  for (int m = 0; m < size<0>(layout); ++m) {
+    for (int n = 0; n < size<1>(layout); ++n) {
+      printf("%3d  ", layout(m,n));
+    }
+    printf("\n");
+  }
+}
+```
+
+which produces the following output for the above examples.
+
+```
+> print2D(layout_2sx4s)
+  0   2   4   6
+  1   3   5   7
+> print2D(layout_2sx4d)
+  0   2   4   6
+  1   3   5   7
+> print2D(layout_2x4)
+  0   2   1   3
+  4   6   5   7
+```
+
+The multi-indices within the `layout_2x4` example are handled as expected and interpreted as a rank-2 layout.
+
+Note that for `layout_2x4`, we're using a 1-D coordinate for a 2-D multi-index in the second mode. In fact, we can generalize this and treat all of the above layouts as 1-D layouts. For instance, the following `print1D` function
+
+```c++
+template <class Layout>
+void print1D(Layout const& layout)
+{
+  for (int i = 0; i < size(layout); ++i) {
+    printf("%3d  ", layout(i));
+  }
+}
+```
+
+produces the following output for the above examples.
+
+```
+> print1D(layout_8s)
+  0   1   2   3   4   5   6   7
+> print1D(layout_8d)
+  0   1   2   3   4   5   6   7
+> print1D(layout_2sx4s)
+  0   1   2   3   4   5   6   7
+> print1D(layout_2sx4d)
+  0   1   2   3   4   5   6   7
+> print1D(layout_2x4)
+  0   4   2   6   1   5   3   7
+```
+
+This shows explicitly that all of the layouts are simply folded views of an 8-element array.
+
+## Summary
+
+* The `Shape` of a `Layout` defines its coordinate space(s).
+
+  * Every `Layout` has a 1-D coordinate space.
+    This can be used to iterate in a "generalized-column-major" order.
+
+  * Every `Layout` has an R-D coordinate space,
+    where R is the rank of the layout.
+    These spaces are ordered _colexicographically_
+    (reading right to left, instead of "lexicographically,"
+    which reads left to right).
+    The enumeration of that order
+    corresponds to the 1-D coordinates above.
+
+  * Every `Layout` has an h-D coordinate space where h is "hierarchical." These are ordered colexicographically and the enumeration of that order corresponds to the 1-D coordinates above. An h-D coordinate is congruent to the `Shape` so that each element of the coordinate has a corresponding element of the `Shape`.
+
+* The `Stride` of a `Layout` maps coordinates to indices.
+
+  * In general, this could be any function from 1-D coordinates (integers) to indices (integers).
+
+  * In `CuTe` we use an inner product of the h-D coordinates with the `Stride` elements.
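+
+As a closing worked example of that inner-product rule (a sketch of ours, not one of the examples above), consider Shape (4,(2,2)) with Stride (2,(1,8)):
+
+```c++
+// The h-D coordinate (i,(j,k)) maps to index i*2 + j*1 + k*8.
+auto layout = make_layout(make_shape (4, make_shape (2,2)),
+                          make_stride(2, make_stride(1,8)));
+print(layout(make_coord(3, make_coord(1,1))));  // 3*2 + 1*1 + 1*8 = 15
+```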
diff --git a/media/docs/cute/02_layout_operations.md b/media/docs/cute/02_layout_operations.md
new file mode 100644
index 0000000000..f9c9734a79
--- /dev/null
+++ b/media/docs/cute/02_layout_operations.md
@@ -0,0 +1,710 @@
+# CuTe Layout Operations
+
+CuTe provides an "algebra of `Layout`s."
+`Layout`s can be combined and manipulated
+to construct more complicated `Layout`s.
+This includes tiling and partitioning `Layout`s across other `Layout`s.
+In this section, we explain some of these core operations in detail.
+
+## How do I print CuTe objects on host or device?
+
+CuTe comes with different ways to print CuTe objects.
+You can print human-readable text,
+or you can print LaTeX commands for generating
+a beautifully formatted and colored table
+describing the CuTe object.
+Both of these can be helpful for reasoning about or debugging
+layouts, copy atoms, or matrix multiply atoms
+(don't worry, we'll explain all of these things in this tutorial).
+
+CuTe's print functions work on either host or device.
+Note that printing on device is expensive.
+Even print code that is never called
+(e.g., printing in an `if` branch that is not taken at run time)
+may generate slower code.
+Thus, be sure to remove code that prints on device after debugging.
+
+The following code examples assume that you have a
+`using namespace cute;` statement in scope.
+
+### Printing human-readable text
+
+The `cute::print` function has overloads for almost all CuTe types, including Pointers, Layout, Shape, Stride, and Tensors. When in doubt, try calling `print` on it. You might also only want to print on thread 0 of each thread block, or block 0 of the grid. The `thread0()` function returns true only for global thread 0 of the kernel. A typical idiom is to print CuTe objects only on thread 0 of block 0.
+
+```c++
+if (thread0()) {
+  print(some_cute_object);
+}
+```
+
+Some algorithms do different things on different threads or blocks,
+so you might sometimes need to print on threads or blocks other than zero.
+The header file
+[`cute/util/debug.hpp`](../../../include/cute/util/debug.hpp),
+among other utilities,
+includes the function `bool thread(int tid, int bid)`
+that returns `true` if running on thread `tid` and block `bid`.
+
+Some CuTe types have special printing functions that use a different output format.
+For example, `print_layout` can display a rank-2 layout in a table
+(using plain text formatting).
+It has an overload taking a rank-2 matrix layout and a thread layout
+that displays a table with the mapping between threads and values.
+
+Some CuTe types might not have overloads for `print`,
+but there are other ways to print their contents.
+For example, copy atoms and mma atoms
+(see elsewhere in this tutorial)
+have a `print_all()` member function.
+
+### Printing LaTeX output
+
+The `cute::print_latex` function works like `cute::print`,
+but prints LaTeX commands that you can use
+to generate a nicely formatted and colored table.
+
+## Fundamental types
+
+### Layout and its components
+
+This directory includes
+[an overview of CuTe's fundamental types for describing layouts](./01_layout.md).
+
+#### Tuple
+
+CuTe starts with a Tuple, which is a finite ordered list of zero or more elements.
+In C++, we identify a Tuple with the
+[`cute::tuple` class](../../../include/cute/container/tuple.hpp).
+`cute::tuple` behaves like `std::tuple`, but it works on device or host,
+and it imposes restrictions on its template arguments for performance and simplicity.
+
+#### IntTuple
+
+CuTe then defines an IntTuple as either an integer, or a Tuple of IntTuple.
+This recursive definition lets us build arbitrarily nested layouts.
+In C++, we identify an IntTuple with [`IntTuple`](../../../include/cute/int_tuple.hpp),
+which is just an alias of `cute::tuple`.
+Any of the following are thus valid template arguments of IntTuple.
+
+1. "Run-time integers" (or "dynamic integers")
+   are just ordinary integral types like `int` or `size_t`.
+
+2. "Compile-time integers" include `std::integral_constant`
+   or subclasses of it that CuTe defines,
+   such as `Int<N>` (see below).
+   These types all have in common
+   that the value is encoded in the type itself
+   (as a public `static constexpr value` member).
+   CuTe defines aliases `_1`, `_2`, `_3` etc.
+   to the types `Int<1>`, `Int<2>`, `Int<3>` etc.
+
+3. `IntTuple` with any valid template arguments.
+
+CuTe reuses IntTuple for many different things,
+including Shape, Stride, Step, and Coord
+(see [`include/cute/layout.hpp`](../../../include/cute/layout.hpp)).
+In C++, Shape, Stride, Step, and Coord are all aliases for IntTuple.
+
+### Layout
+
+A Layout is a tuple of (Shape, Stride).
+Semantically, it implements a mapping from
+a "logical" Shape-shaped (multidimensional) index
+to a "physical" 1-D index into an array.
+Here is an example of a 2 x 3 array with static strides (3, 1).
+
+```c++
+Layout layout = make_layout(make_shape (_2{}, _3{}),
+                            make_stride(_3{}, _1{}));
+print_layout(layout);
+for (int i = 0; i < size(layout); ++i) {
+  print(layout(i));
+  print(", ");
+}
+print("\n");
+print(layout(1, 1));
+print("\n");
+```
+
+This code produces the following text output.
+
+```text
+(_2,_3):(_3,_1)
+      0   1   2
+    +---+---+---+
+ 0  | 0 | 1 | 2 |
+    +---+---+---+
+ 1  | 3 | 4 | 5 |
+    +---+---+---+
+0, 3, 1, 4, 2, 5,
+4
+```
+
+`print(layout(1, 1))` prints the mapping of
+the logical 2-D coordinate (1,1) to the 1-D index, which is 4.
+You can see that from the table,
+which shows the left logical index as the "row,"
+and the right logical index as the "column."
+
+### Underscore (`_`)
+
+An Underscore is a special type used for array slices. The underscore punctuation `_` is a constant instance of Underscore. It acts like `:` (the colon punctuation) in Python or Fortran array slices. See [`include/cute/underscore.hpp`](../../../include/cute/underscore.hpp).
+
+### Tile
+
+"A Tile is not a Layout, it's a tuple of Layouts or Tiles or Underscores."
+See [`include/cute/tile.hpp`](../../../include/cute/tile.hpp).
+
+The algebraic layout operations discussed below are defined on `Layout`s, but `Tile` allows these operations to recurse and to be applied to sublayouts or particular modes of a given Layout. These are referred to as by-mode operations.
+
+See the section on "Logical Divide" to see an example of using `Tile` to extract portions of a row-mode and portions of a column-mode independently.
+
+## Layout definitions and operations
+
+### Layouts are functions from integers (logical 1-D coordinate) to integers (1-D index)
+
+The `for` loop in the above print example shows how CuTe identifies 1-D coordinates with a column-major layout of logical 2-D coordinates. Iterating from `i = 0` to `size(layout)` (which is 6), and indexing into our layout with the single integer coordinate `i`, traverses the layout in column-major fashion, even though this is a row-major layout.
You can see this from the output of the `for` loop (0, 3, 1, 4, 2, 5). CuTe calls this index `i` a "1-D coordinate," versus the "natural coordinate," which would be the logical 2-D coordinate.
+
+If you're familiar with the C++23 feature `mdspan`,
+this is an important difference between
+`mdspan` layout mappings and CuTe `Layout`s.
+`mdspan` layout mappings are *one way*:
+they always take a multidimensional logical coordinate,
+and they return an integer offset.
+Depending on the strides,
+the offset may skip over elements of the physical 1-D array.
+Thus, `mdspan`'s offset does NOT mean the same thing as
+the 1-D logical coordinate `i` in the `for` loop above.
+You can iterate correctly over any CuTe `Layout`
+by using the 1-D logical coordinate.
+`mdspan` doesn't have an idea of a 1-D logical coordinate.
+
+### Rank, depth, size, cosize
+
+*Rank*: the tuple size of the layout's shape.
+
+*Depth*: the depth of the layout's shape. A single integer has depth 0. A tuple has depth 1 + the max depth of its components.
+
+*Size*: Size of the shape; size of the domain of the function. This is the product of all extents in the layout's shape.
+
+*Cosize*: Size of the function's codomain (not necessarily the range); for a layout A, A(size(A) - 1) + 1. (Here, we use size(A) - 1 as a 1-D logical coordinate input.)
+
+### Layout compatibility
+
+We say that layouts A and B are *compatible* if their shapes are compatible. Shape A is compatible with shape B if any natural coordinate of A is also a valid coordinate for B.
+
+### Flatten
+
+The `flatten` operation "un-nests" a potentially nested Layout. For example,
+
+```c++
+Layout layout = Layout<Shape <Shape <_4, _3>, _1>,
+                       Stride<Stride<_3, _1>, _0>>{};
+Layout flat_layout = flatten(layout);
+```
+
+results in `flat_layout` having the following type
+
+```text
+Layout<Shape<_4, _3, _1>, Stride<_3, _1, _0>>
+```
+
+and
+
+```c++
+Layout layout = Layout<Shape <_4, Shape <_4, _4>>,
+                       Stride<_4, Stride<_1, _16>>>{};
+Layout flat_layout = flatten(layout);
+```
+
+results in `flat_layout` having the following type
+
+```text
+Layout<Shape<_4, _4, _4>, Stride<_4, _1, _16>>
+```
+
+Hierarchical Layouts and flattening let us reinterpret tensors in place as matrices, matrices as vectors, vectors as matrices, etc. This lets us implement arbitrary tensor contractions as batched matrix multiply, by combining the contraction modes into a single mode, and combining the A, B, C, and "batch" modes as needed to reach the desired form.
+
+### Coalesce
+
+The `coalesce` operation first flattens the layout, then combines all the modes that are possible to combine, starting with mode 0 (the leftmost mode) and moving right. If all the modes can be combined, then this results in a 1-D layout expressing what array elements the original layout accesses.
+
+For example,
+
+```text
+layout: (_2,(_1,_6)):(_1,(_6,_2))
+coalesce(layout): _12:_1
+```
+
+What does it mean to "combine" modes? In the above example, the flattened layout is (2, 1, 6) : (1, 6, 2).
+
+1. If we look at the leftmost two modes, this is just a vector of length 2 and stride 1. The middle mode has extent 1, so the corresponding stride 6 would not be observed anyway. This leaves us with (2, 6) : (1, 2).
+
+2. The intermediate result (2, 6) : (1, 2) is just a 2 x 6 column-major matrix, which can be coalesced into a vector of length 12 and stride 1.
+
+More formally, "combining all the modes" means a left fold, where the binary operation that combines two modes has three cases.
+
+1. If the leftmost layout is s1:d1, and the next layout is 1:d0, then combine into s1:d1. This generalizes Step 1 above.
+   If a mode has extent 1, we can't observe its stride, so we can skip the mode.
+
+2. If the leftmost layout is 1:d1, and the next layout is s0:d0, then combine into s0:d0. Again, if a mode has extent 1, we can't observe its stride, so we can skip the mode.
+
+3. If the leftmost layout is s1:d1, and the next layout is s0 : s1*d1, then combine into s0 * s1 : d1. This generalizes Step 2 above. One can call this "noticing a column-major layout sequence."
+
+That's it! For example, the result of coalescing the row-major layout (2, 2) : (2, 1) is (2, 2) : (2, 1), the same layout, because none of the above three cases applies.
+
+### Complement
+
+#### Definition
+
+The complement B of a layout A with respect to an integer M satisfies the following properties.
+
+1. $A$ and $B$ are *disjoint*: $A(x) \neq B(x)$ for all $x \neq 0$ in the domain of $A$.
+
+2. B is *ordered*: $B(x-1) < B(x)$ for all $x$ in $\{0, 1, \dots, size(B) - 1\}$.
+
+3. B is *bounded* by M: $size(B) \geq M / size(A)$, and $cosize(B) \leq floor(M / cosize(A)) * cosize(A)$.
+
+Regarding disjointness: we need to specify $x \neq 0$ because CuTe layouts are linear. That is, if the domain is nonempty, the range always contains zero.
+
+Regarding the ordered property: CuTe layouts are hierarchically strided, so this implies that if size(B) is nonzero, then the strides of B are all positive.
+
+#### Examples
+
+complement(4:1, 24) is 6:4.
+
+1. The result is disjoint from 4:1, so it must have a stride of at least 4 (since it includes 0, but must skip over 1, 2, 3).
+
+2. The size of the result is $\geq 24 / 4 = 6$. (This plus Step (1) means that the cosize is at least 24.)
+
+3. The cosize of the result is $\leq (24 / 4) * 4 = 24$. (This plus Step (2) means that the cosize is exactly 24.)
+
+4. The only (1-D) layout with size 6 and cosize 24 is 6:4.
+
+complement(6:4, 24) is 4:1.
+
+1. 4:1 is disjoint from 6:4, but so is s:d
+   for any s > 0 and d > 20.
+
+2. The size of the result is $\geq 24 / 6 = 4$.
+
+3. The cosize of the result is $\leq (24 / 21) * 21 = 21$.
+
+4. The stride cannot be greater than 20
+   (else (2) would contradict (3)),
+   so it must be less than 4.
+
+5. This leaves 4:1 by elimination.
+
+### Composition
+
+Layouts are functions, so composition of layouts is just composition of functions. The composition $A \circ B$ means "apply the layout B first, then treat the result as a 1-D logical coordinate input to the layout A, and apply A to it." Very often, this composition can be represented as another Layout.
+
+#### Rules for computing composition
+
+Both humans and CuTe compute composition using the following rules.
+
+1. $A \circ B$ has a shape that is compatible with B. In function composition, the rightmost function defines the domain. For `Layout`s this means that any valid coordinate for $B$ can also be used as a coordinate for $A \circ B$.
+
+2. Concatenation: A layout can be expressed as the concatenation of its sublayouts. We denote concatenation with parentheses: $B = (B_0,B_1,...)$. The CuTe function `make_layout`, when given zero or more `Layout`s, concatenates them.
+
+3. Composition is (left-)distributive with concatenation: $A \circ B = A \circ (B_0, B_1, ...) = (A \circ B_0, A \circ B_1, ...)$.
+
+4. "Base case": For layouts $A = a : b$ and $B = c : d$ with integral shape and stride, $A \circ B = R = c : (b * d)$.
+
+5. By-mode composition: Let $\langle B, C \rangle$ (angle brackets, not parentheses)
+   denote a tuple of two layouts B and C, not their concatenation. Let $A = (A_0, A_1)$.
+   Then, $A \circ \langle B, C \rangle = (A_0, A_1) \circ \langle B, C \rangle = (A_0 \circ B, A_1 \circ C)$.
+   This allows the application of composition independently to sublayouts of $A$.
+
+#### Examples: Reshape a vector into a matrix
+
+This section gives two composition examples. Both start with a vector with layout $20:2$ (that is, the vector has 20 elements, and the stride between each is 2). They compose this vector with a 4 x 5 matrix layout. This effectively "reshapes" the vector in place into a matrix.
+
+##### Example 1
+
+$20:2 \circ (4,5) : (1,4)$.
+
+This describes interpreting the vector $20:2$
+as a 4 x 5 column-major matrix.
+
+The resulting layout has shape $(4,5)$,
+because in function composition,
+the rightmost function defines the domain.
+What are the strides?
+
+1. A layout can be expressed as the concatenation of its sublayouts,
+   so $(4,5) : (1,4)$ is $(4:1, 5:4)$.
+
+2. Composition is distributive, so
+   $20:2 \circ (4:1, 5:4)$ is $(20:2 \circ 4:1, 20:2 \circ 5:4)$.
+
+3. $20:2 \circ 4:1$ has shape 4 (rightmost function defines the domain)
+   and stride $2 = 2 \cdot 1$.
+
+4. $20:2 \circ 5:4$ has shape 5 and stride $8 = 2 \cdot 4$.
+
+5. Result: (4:2, 5:8), which by concatenation is (4,5) : (2,8).
+
+##### Example 2
+
+$20:2 \circ (4,5) : (5,1)$.
+
+This describes interpreting the vector 20:2
+as a 4 x 5 row-major matrix.
+
+The resulting layout has shape $(4,5)$, just as before. What are the strides?
+
+1. By deconcatenation, $(4,5) : (5,1)$ is $(4:5, 5:1)$.
+
+2. Composition is distributive, so $20:2 \circ (4:5, 5:1)$ is $(20:2 \circ 4:5, 20:2 \circ 5:1)$.
+
+3. $20:2 \circ 4:5$ has shape $4$ and stride $10 = 2 \cdot 5$.
+
+4. $20:2 \circ 5:1$ has shape $5$ and stride $2 = 2 \cdot 1$.
+
+5. Result: (4:10, 5:2), which by concatenation is (4,5) : (10,2).
+
+### Product
+
+CuTe includes four different kinds of layout products.
+
+1. `logical_product`
+
+2. `blocked_product`
+
+3. `raked_product`
+
+4. `tiled_product`
+
+`logical_product(A, B)` results in a layout where each element of layout B
+has been replaced by a "copy" of layout A.
+The other three products offer variations of this idea.
+
+#### Example: Tiled matrix
+
+Suppose that I want to make a matrix consisting of 3 x 4 tiles
+in a row-major arrangement,
+where each tile is a 2 x 2 column-major matrix.
+
+The Layout of each tile (`tile`) has Shape (2,2) and Stride (1,2).
+
+The Layout of the "matrix of tiles" (`matrix_of_tiles`)
+has Shape (3,4) and Stride (4,1).
+
+##### Blocked product: the intuitive tiling
+
+If I were to deduce by hand what the layout of the tiled matrix should be,
+it would look like this.
+
+| | (0,0) | (1,0) | (0,1) | (1,1) | (0,2) | (1,2) | (0,3) | (1,3) |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| (0,0) | 0 | 2 | 4 | 6 | 8 | 10 | 12 | 14 |
+| (1,0) | 1 | 3 | 5 | 7 | 9 | 11 | 13 | 15 |
+| (0,1) | 16 | 18 | 20 | 22 | 24 | 26 | 28 | 30 |
+| (1,1) | 17 | 19 | 21 | 23 | 25 | 27 | 29 | 31 |
+| (0,2) | 32 | 34 | 36 | 38 | 40 | 42 | 44 | 46 |
+| (1,2) | 33 | 35 | 37 | 39 | 41 | 43 | 45 | 47 |
+
+The row and column labels use the equivalence of 1-D logical coordinates and 2-D column-major coordinates. The left index in each pair is the row (resp. column) coordinate of the tile, while the right index in each pair is the row (resp. column) coordinate of the matrix-of-tiles. The resulting layout has Shape ((2, 3), (2, 4)) and Stride ((1, 16), (2, 4)), and the second mode can be coalesced.
+The Shape ((2, 3), (2, 4)) is hierarchical, but it is still rank-2 and can be drawn in 2D as above. Note how the row mode of the tile remains part of the row mode of the product, and the column mode of the tile remains a column mode of the product.
+
+The above layout is what `blocked_product(tile, matrix_of_tiles)` produces.
+A critical use case for blocked product is "tiling" an "atom"
+(some tile that relates to a hardware feature) over a matrix.
+
+```c++
+Layout tile            = Layout<Shape <_2,_2>,
+                                Stride<_1,_2>>{};
+Layout matrix_of_tiles = Layout<Shape <_3,_4>,
+                                Stride<_4,_1>>{};
+
+print_layout(blocked_product(tile, matrix_of_tiles));
+```
+
+##### Logical product
+
+The logical product `logical_product(tile, matrix_of_tiles)`
+results in Shape ((2, 2), (3, 4)) and Stride ((1, 2), (16, 4)).
+
+| | (0,0) | (1,0) | (2,0) | (0,1) | (1,1) | (2,1) | (0,2) | (1,2) | (2,2) | (0,3) | (1,3) | (2,3) |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| (0,0) | 0 | 16 | 32 | 4 | 20 | 36 | 8 | 24 | 40 | 12 | 28 | 44 |
+| (1,0) | 1 | 17 | 33 | 5 | 21 | 37 | 9 | 25 | 41 | 13 | 29 | 45 |
+| (0,1) | 2 | 18 | 34 | 6 | 22 | 38 | 10 | 26 | 42 | 14 | 30 | 46 |
+| (1,1) | 3 | 19 | 35 | 7 | 23 | 39 | 11 | 27 | 43 | 15 | 31 | 47 |
+
+Note how the tile appears in the leftmost column and is reproduced
+in each column in the same order as the matrix-of-tiles. That is,
+the tile can be indexed through the first mode of the result and the
+matrix-of-tiles can be indexed through the second mode.
+
+```c++
+Layout tile            = Layout<Shape <_2,_2>,
+                                Stride<_1,_2>>{};
+Layout matrix_of_tiles = Layout<Shape <_3,_4>,
+                                Stride<_4,_1>>{};
+
+print_layout(logical_product(tile, matrix_of_tiles));
+```
+
+##### Raked product
+
+The raked product `raked_product(tile, matrix_of_tiles)` results in
+Shape ((3, 2), (4, 2)) and Stride ((16, 1), (4, 2)).
+
+| | (0,0) | (1,0) | (2,0) | (3,0) | (0,1) | (1,1) | (2,1) | (3,1) |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| (0,0) | 0 | 4 | 8 | 12 | 2 | 6 | 10 | 14 |
+| (1,0) | 16 | 20 | 24 | 28 | 18 | 22 | 26 | 30 |
+| (2,0) | 32 | 36 | 40 | 44 | 34 | 38 | 42 | 46 |
+| (0,1) | 1 | 5 | 9 | 13 | 3 | 7 | 11 | 15 |
+| (1,1) | 17 | 21 | 25 | 29 | 19 | 23 | 27 | 31 |
+| (2,1) | 33 | 37 | 41 | 45 | 35 | 39 | 43 | 47 |
+
+The tile is now interleaved or "raked" across the 3x4 matrix-of-tiles
+instead of appearing as blocks. Other references call this a cyclic
+distribution.
+
+This might look familiar if you have ever used ScaLAPACK.
+It expresses a 2-D block cyclic distribution of a 6 x 8 matrix
+over 4 processes in a 2 x 2 "process grid." See
+["The Two-dimensional Block-Cyclic Distribution"](https://netlib.org/scalapack/slug/node75.html#sec2dbcd)
+and
+["Local Storage Scheme and Block-Cyclic Mapping"](https://netlib.org/scalapack/slug/node76.html#seclocalstorage)
+in the ScaLAPACK Users' Guide.
+
+In general, `logical_product` and these variations can produce any interleaving,
+including blocked, cyclic, by-mode blocked/cyclic, and intermediate interleavings
+that don't have common names.
+
+```c++
+Layout tile            = Layout<Shape <_2,_2>,
+                                Stride<_1,_2>>{};
+Layout matrix_of_tiles = Layout<Shape <_3,_4>,
+                                Stride<_4,_1>>{};
+
+print_layout(raked_product(tile, matrix_of_tiles));
+```
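+
+As a quick sanity check (a worked example of ours), one entry of the raked-product table above can be reproduced by evaluating the resulting layout at a hierarchical coordinate:
+
+```c++
+// Raked product result from above: Shape ((3,2),(4,2)), Stride ((16,1),(4,2)).
+Layout raked = Layout<Shape <Shape <_3,_2>,Shape <_4,_2>>,
+                      Stride<Stride<_16,_1>,Stride<_4,_2>>>{};
+// Row (1,0), column (2,1): index = 1*16 + 0*1 + 2*4 + 1*2 = 26,
+// matching that entry of the table.
+print(raked(make_coord(make_coord(1,0), make_coord(2,1))));  // 26
+```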
+
+### Division
+
+The previous section covered layout products,
+which reproduce one layout over another.
+This section covers layout *division*.
+Functions that divide a layout into components are useful
+as a basis for tiling and partitioning layouts.
+
+For example, consider folding a vector into a matrix.
+
+We could imagine an operation, called `logical_divide`,
+
+```c++
+Layout vec = Layout<_16,_3>{};          // 16 : 3
+Layout col = Layout< _4,_1>{};          //  4 : 1
+Layout mat = logical_divide(vec, col);  // (4,4) : (3,12)
+```
+
+that "takes" the first 4 elements of the vector into the first mode
+and leaves the "rest" in the second mode. This is a column-major matrix
+view of the data in `vec`.
+What if we want a row-major matrix view?
+
+```c++
+Layout vec = Layout<_16,_3>{};          // 16 : 3
+Layout col = Layout< _4,_4>{};          //  4 : 4
+Layout mat = logical_divide(vec, col);  // (4,4) : (12,3)
+```
+
+Now, every fourth element of the vector is in the first mode and
+the "rest" are in the second mode.
+Multidimensional, hierarchical indices let us extend this operation
+to any layout that "divides" the vector.
+
+```c++
+Layout vec = Layout<_16,_3>{};          // 16 : 3
+Layout col = Layout< _4,_2>{};          //  4 : 2
+Layout mat = logical_divide(vec, col);  // (4,(2,2)) : (6,(3,24))
+```
+
+```c++
+Layout vec = Layout<_16,_3>{};          // 16 : 3
+Layout col = Layout<Shape <_2,_2>,
+                    Stride<_4,_1>>{};   // (2,2) : (4,1)
+Layout mat = logical_divide(vec, col);  // ((2,2),(2,2)) : ((12,3),(6,24))
+```
+
+All of the above examples produce a 4x4 matrix
+that can be indexed and treated like a normal 4x4 matrix,
+but each has a different underlying layout.
+Thus, our algorithms can be written using logical coordinates,
+without needing to address the detailed indexing that each layout requires.
+
+CuTe includes 3 different kinds of layout division operations.
+
+1. `logical_divide`
+
+2. `zipped_divide`
+
+3. `tiled_divide`
+
+We will summarize these in the sections that follow.
+
+#### Logical divide : the intuitive tiling
+
+Suppose I have the 6 x 8 matrix from the Raked Product section
+and want to "collect" the `tile`, turning the Raked Product into
+the Blocked Product.
+
+To do this, we would like to gather two elements from the column
+and leave the rest, then gather two elements from the row and leave the rest.
+Thus, we want to apply `logical_divide` independently to the rows and cols
+in order to retrieve the appropriate elements.
+
+In code, we copy the Layout from the result of the Raked Product section, then
+specify the elements in the rows and cols we would like to gather.
+
+```c++
+Layout raked_prod = Layout<Shape <Shape <_3,_2>,Shape <_4,_2>>,
+                           Stride<Stride<_16,_1>,Stride<_4,_2>>>{};
+auto   subtile    = make_tile(Layout<_2,_3>{},  // Gather elements 2 : 3 from mode 0
+                              Layout<_2,_4>{}); // Gather elements 2 : 4 from mode 1
+
+print_layout(logical_divide(raked_prod, subtile));
+```
+
+Indeed, this does produce the result from the Blocked Product section.
+
+| | (0,0) | (1,0) | (0,1) | (1,1) | (0,2) | (1,2) | (0,3) | (1,3) |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| (0,0) | 0 | 2 | 4 | 6 | 8 | 10 | 12 | 14 |
+| (1,0) | 1 | 3 | 5 | 7 | 9 | 11 | 13 | 15 |
+| (0,1) | 16 | 18 | 20 | 22 | 24 | 26 | 28 | 30 |
+| (1,1) | 17 | 19 | 21 | 23 | 25 | 27 | 29 | 31 |
+| (0,2) | 32 | 34 | 36 | 38 | 40 | 42 | 44 | 46 |
+| (1,2) | 33 | 35 | 37 | 39 | 41 | 43 | 45 | 47 |
+
+Of course, any other rearrangement of the rows and cols is also valid.
+
+#### Zipped divide
+
+The `zipped_divide` function applies `logical_divide`, and then gathers the
+"subtiles" into a single mode and the "rest" into a single mode.
+
+For example, if we apply `zipped_divide` instead of `logical_divide` in the example above,
+
+```c++
+Layout raked_prod = Layout<Shape <Shape <_3,_2>,Shape <_4,_2>>,
+                           Stride<Stride<_16,_1>,Stride<_4,_2>>>{};
+auto   subtile    = make_tile(Layout<_2,_3>{},  // Gather elements 2 : 3 from mode 0
+                              Layout<_2,_4>{}); // Gather elements 2 : 4 from mode 1
+
+print_layout(zipped_divide(raked_prod, subtile));
+```
+
+then we get the result
+
+| | (0,0) | (1,0) | (2,0) | (0,1) | (1,1) | (2,1) | (0,2) | (1,2) | (2,2) | (0,3) | (1,3) | (2,3) |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| (0,0) | 0 | 16 | 32 | 4 | 20 | 36 | 8 | 24 | 40 | 12 | 28 | 44 |
+| (1,0) | 1 | 17 | 33 | 5 | 21 | 37 | 9 | 25 | 41 | 13 | 29 | 45 |
+| (0,1) | 2 | 18 | 34 | 6 | 22 | 38 | 10 | 26 | 42 | 14 | 30 | 46 |
+| (1,1) | 3 | 19 | 35 | 7 | 23 | 39 | 11 | 27 | 43 | 15 | 31 | 47 |
+
+Note that this is the same layout as the result in the Logical Product section.
+That is, the first mode is our original tile (and can be interpreted as a 2x2 matrix itself)
+and the second mode is its logical layout within the raked layout.
+
+##### More Examples of Divide
+
+For brevity, shapes can be used with `logical_divide` and `tiled_divide` to quickly split and tile modes of a tensor. For example, this C++ code
+
+```c++
+Layout layout = Layout<Shape <_12, _32,_6>,
+                       Stride< _1,_128,_0>>{};
+auto tile_shape = make_shape(_4{},_8{});
+Layout logical_divided_tile = logical_divide(layout, tile_shape);
+Layout zipped_divided_tile  = zipped_divide(layout, tile_shape);
+
+print("full_layout          : "); print(layout);               print("\n");
+print("tile_shape           : "); print(tile_shape);           print("\n");
+print("logical_divided_tile : "); print(logical_divided_tile); print("\n");
+print("zipped_divided_tile  : "); print(zipped_divided_tile);  print("\n\n");
+```
+
+produces the following output when we vary `layout`.
+
+```text
+full_layout          : (_12,_32,_6):(_1,_128,_0)
+tile_shape           : (_4,_8)
+logical_divided_tile : ((_4,_3),(_8,_4),_6):((_1,_4),(_128,_1024),_0)
+zipped_divided_tile  : ((_4,_8),(_3,_4,_6)):((_1,_128),(_4,_1024,_0))
+
+full_layout          : (_12,(_4,_8),_6):(_1,(_32,_512),_0)
+tile_shape           : (_4,_8)
+logical_divided_tile : ((_4,_3),((_4,_2),_4),_6):((_1,_4),((_32,_512),_1024),_0)
+zipped_divided_tile  : ((_4,(_4,_2)),(_3,_4,_6)):((_1,(_32,_512)),(_4,_1024,_0))
+```
+
+This code
+
+```c++
+Layout layout = make_layout(Shape<_8,_8>{},
+                            Stride<_8,_1>{});
+auto tile = make_tile(make_layout(Shape<_4>{}),
+                      make_layout(Shape<_2>{}));
+print("layout: ");
+print_layout(layout);
+print("\n");
+print("tile: ");
+print(tile);
+print("\n");
+print("logical_divide: ");
+print_layout(logical_divide(layout, tile));
+print("zipped_divide: ");
+print_layout(zipped_divide(layout, tile));
+```
+
+results in the following layouts.
+

+*(figure: logical_divide-and-zipped_divide)*

+
+This code
+
+```c++
+Layout layout = make_layout(Shape <_8,_8>{},
+                            Stride<_8,_1>{});
+auto   tile   = make_tile(make_layout(Shape<_2>{}),
+                          make_layout(Shape<_4>{}));
+print("layout: ");
+print_layout(layout);
+print("\n");
+print("tile: ");
+print(tile);
+print("\n");
+print("logical_divide: ");
+print_layout(logical_divide(layout, tile));
+print("zipped_divide: ");
+print_layout(zipped_divide(layout, tile));
+```
+
+results in the following layouts.
+

+*(figure: logical_divide-and-zipped_divide-2)*

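+
+In practice, a zipped-divided layout is most often used through a `Tensor`:
+mode 0 of the result addresses the elements within one tile,
+and mode 1 selects which tile.
+As a rough sketch (the global-memory pointer `ptr` is assumed here
+for illustration, and we reuse the 8x8 row-major layout from above):
+
+```c++
+Tensor t     = make_tensor(make_gmem_ptr(ptr),
+                           make_layout(Shape <_8,_8>{},
+                                       Stride<_8,_1>{}));
+// Mode 0 is a (_4,_2) tile; mode 1 indexes the 2x4 arrangement of tiles.
+Tensor tiled = zipped_divide(t, make_shape(Int<4>{}, Int<2>{}));
+// Keep all of mode 0 and fix mode 1: the (1,2)-th tile, itself a (_4,_2) Tensor.
+Tensor tile_12 = tiled(make_coord(_,_), make_coord(1,2));
+```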
+
+#### Tiled divide
+
+The `tiled_divide` function works like `zipped_divide`,
+except that it unpacks the second mode.
+This is useful, for example, when you have a `Tile` that describes
+all of the elements needed by a particular operation
+and you want to gather those together
+while retaining the logical arrangement of the tiles within the original layout.
+That is,
+
+```text
+Layout Shape : (M, N, L, ...)
+Tile Shape   : <M', N'>
+Tiled Result : ((M', N'), m, n, L, ...)
+```
+
+where `m` is `M / M'` and `n` is `N / N'`.
+We can consider `m` as the "number of `Tile`s in `M`" and `n` as the "number of `Tile`s in `N`".
+This style of operation is common when applying MMA Atoms and Copy Atoms.
diff --git a/media/docs/cute/03_tensor.md b/media/docs/cute/03_tensor.md
new file mode 100644
index 0000000000..2382d834f7
--- /dev/null
+++ b/media/docs/cute/03_tensor.md
@@ -0,0 +1,262 @@
+# CuTe Tensors
+
+## A Tensor is a multidimensional array
+
+CuTe's `Tensor` class represents a multidimensional array.
+The array's elements can live in any kind of memory,
+including global memory, shared memory, and register memory.
+
+### Array access
+
+Users access a `Tensor`'s elements in one of three ways:
+
+* `operator()`, taking as many integral arguments as the number of modes,
+  corresponding to the element's (possibly) multidimensional logical index;
+
+* `operator()`, taking a `Coord` (an `IntTuple` of the logical indices); or
+
+* `operator[]`, taking a `Coord` (an `IntTuple` of the logical indices).
+
+### Slices: Get a Tensor accessing a subset of elements
+
+Users can get a "slice" of a `Tensor`,
+that is, a `Tensor` that accesses a subset of elements
+of the original `Tensor`.
+
+Slices happen through the same `operator()`
+that users call to access an individual element.
+Passing in `_` (the underscore character, an instance of `Underscore`)
+has the same effect as `:` (the colon character) in Fortran or Matlab:
+the resulting slice accesses all indices in that mode of the `Tensor`.
+
+### Tensor's behavior determined by its Layout and Engine
+
+A `Tensor`'s behavior is entirely determined by its two components,
+which correspond to its two template parameters: `Engine` and `Layout`.
+
+For a description of `Layout`,
+please refer to [the `Layout` section](./01_layout.md)
+of this tutorial, or the [GEMM overview](./0x_gemm_tutorial.md).
+
+An `Engine` represents a one-dimensional array of elements.
+When users perform array access on a `Tensor`,
+the `Tensor` uses its `Layout` to map from a logical coordinate
+to a one-dimensional index.
+Then, the `Tensor` uses its `Engine`
+to map the one-dimensional index
+to a reference to the element.
+You can see this in `Tensor`'s implementation of array access.
+
+```c++
+decltype(auto) operator[](Coord const& coord) {
+  return engine().begin()[layout()(coord)];
+}
+```
+
+One could summarize almost all CuTe use cases as follows:
+
+* create `Layout`s,
+
+* create `Tensor`s with those `Layout`s, and
+
+* invoke (either CuTe's, or custom) algorithms on those `Tensor`s.
+
+### Ownership of the elements
+
+`Tensor`s can be owning or nonowning.
+
+"Owning" `Tensor`s behave like `std::array`.
+When you copy the `Tensor`, you (deep-)copy its elements,
+and the `Tensor`'s destructor deallocates the array of elements.
+
+"Nonowning" `Tensor`s behave like a (raw) pointer to the elements.
+Copying the `Tensor` doesn't copy the elements,
+and destroying the `Tensor` doesn't deallocate the array of elements.
+
+Whether a `Tensor` is "owning" or "nonowning" depends entirely on its `Engine`.
+This has implications for developers of generic `Tensor` algorithms.
+For example, input `Tensor` parameters of a function
+should be passed by const reference,
+because passing the `Tensor`s by value
+might make a deep copy of the `Tensor`'s elements.
+It might also *not* make a deep copy of the elements;
+there's no way to know without specializing the algorithm
+on the `Tensor`'s `Engine` type.
+Similarly, output or input/output `Tensor` parameters of a function
+should be passed by (nonconst) reference.
+Returning a `Tensor` might (or might not)
+make a deep copy of the elements.
+
+The various overloads of the `copy_if` algorithm in
+[`include/cute/algorithm/copy.hpp`](../../../include/cute/algorithm/copy.hpp)
+take their `src` (input, source of the copy) parameter
+as `Tensor const&`,
+and take their `dst` (output, destination of the copy) parameter
+as `Tensor&`.
+Additionally, there are overloads for mutable temporaries like
+`Tensor&&`
+so that these algorithms can be applied directly to slices,
+as in the following example.
+
+```c++
+copy(src_tensor(_,3), dst_tensor(2,_));
+```
+
+In C++ terms, each of the expressions
+`src_tensor(_,3)` and `dst_tensor(2,_)`
+is in the "prvalue"
+[value category](https://en.cppreference.com/w/cpp/language/value_category),
+because it is a function call expression
+whose return type is nonreference.
+(In this case, calling `Tensor::operator()`
+with at least one `_` (`Underscore`) argument
+returns a `Tensor`.)
+The prvalue `dst_tensor(2,_)` won't match
+the `copy` overload taking
+`Tensor&`,
+because prvalues can't be bound to
+nonconst lvalue references (single `&`).
+However, it will match the `copy` overload taking
+`Tensor&&`
+(note the two `&&` instead of one `&`).
+Calling the latter overload binds the reference
+to the prvalue `dst_tensor(2,_)`.
+This results in
+[creation of a temporary](https://en.cppreference.com/w/cpp/language/implicit_conversion#Temporary_materialization)
+`Tensor` result to be passed into `copy`.
+
+### CuTe's provided `Engine` types
+
+CuTe comes with three `Engine` types.
+
+* `ArrayEngine<T, N>`: an owning `Engine`,
+  representing an array of `N` elements of type `T`
+
+* `ViewEngine<Iterator>`: a nonowning `Engine`,
+  where `Iterator` is a random access iterator
+  (either a pointer to an array, or something that acts like one)
+
+* `ConstViewEngine<Iterator>`: a nonowning `Engine`,
+  which is the view-of-const-elements version of `ViewEngine`
+
+### "Tags" for different kinds of memory
+
+`ViewEngine` and `ConstViewEngine` wrap pointers to various kinds of memory.
+Users can "tag" the memory with its space -- e.g., global or shared --
+by calling `make_gmem_ptr(g)` when `g` is a pointer to global memory,
+or `make_smem_ptr(s)` when `s` is a pointer to shared memory.
+
+Tagging memory makes it possible for CuTe's `Tensor` algorithms
+to use the fastest implementation for the specific kind of memory.
+It also avoids incorrect memory access.
+For example, some kinds of optimized copy operations require
+that the source of the copy be in global memory,
+and the destination of the copy be in shared memory.
+Tagging makes it possible for CuTe to dispatch
+to those optimized copy operations where possible.
+CuTe does this by specializing `Tensor` algorithms
+on the `Tensor`'s `Engine` type.
+
+### Engine members
+
+In order for a type to be valid for use as an `Engine`,
+it must have the following public members.
+
+```c++
+using value_type = /* ... the value type ... */;
+using iterator   = /* ... the iterator type ... */;
+iterator begin() /* sometimes const */;
+```
+
+## Constructing a Tensor
+
+### Nonowning view of existing memory
+
+A `Tensor` can be a nonowning view of existing memory.
+For this use case, users can create the `Tensor` by calling `make_tensor`
+with two arguments: a wrapped pointer to the memory to view, and the `Layout`.
+Users wrap the pointer by identifying its memory space:
+e.g., global memory (via `make_gmem_ptr`) or shared memory (via `make_smem_ptr`).
+`Tensor`s that view existing memory can have either static or dynamic `Layout`s.
+
+Here are some examples of creating `Tensor`s
+that are nonowning views of existing memory.
+
+```c++
+// Global memory (static or dynamic layouts)
+Tensor gmem_8s     = make_tensor(make_gmem_ptr(A), Int<8>{});
+Tensor gmem_8d     = make_tensor(make_gmem_ptr(A), 8);
+Tensor gmem_8sx16d = make_tensor(make_gmem_ptr(A), make_shape(Int<8>{},16));
+Tensor gmem_8dx16s = make_tensor(make_gmem_ptr(A), make_shape (      8 ,Int<16>{}),
+                                                   make_stride(Int<16>{},Int< 1>{}));
+
+// Shared memory (static or dynamic layouts)
+auto smem_shape = make_shape(Int<4>{},Int<8>{});
+__shared__ T smem[decltype(size(smem_shape))::value];  // (static-only allocation)
+Tensor smem_4x8_col = make_tensor(make_smem_ptr(&smem[0]), smem_shape);
+Tensor smem_4x8_row = make_tensor(make_smem_ptr(&smem[0]), smem_shape, GenRowMajor{});
+```
+
+### Owning array of register memory
+
+A `Tensor` can also be an owning array of register memory.
+For this use case, users can create the `Tensor`
+by calling `make_tensor<T>(layout)`,
+where `T` is the type of each element of the array,
+and `layout` is the `Tensor`'s `Layout`.
+Owning `Tensor`s must have a static `Layout`,
+as CuTe does not perform dynamic memory allocation in `Tensor`s.
+
+Here are some examples of creating owning `Tensor`s.
+
+```c++
+// Register memory (static layouts only)
+Tensor rmem_4x8_col = make_tensor<T>(make_shape(Int<4>{},Int<8>{}));
+Tensor rmem_4x8_row = make_tensor<T>(make_shape(Int<4>{},Int<8>{}), GenRowMajor{});
+Tensor rmem_4x8_mix = make_tensor<T>(make_shape (Int<4>{},Int< 8>{}),
+                                     make_stride(Int<2>{},Int<32>{}));
+Tensor rmem_8       = make_fragment_like(gmem_8sx16d(_,0));
+```
+
+The `make_fragment_like` function makes an owning Tensor of register memory,
+with the same shape as its input `Tensor` argument.
+
+## Tensor use examples
+
+### Copy rows of a matrix from global memory to registers
+
+The following example copies rows of a matrix (with any `Layout`)
+from global memory to register memory,
+then executes some algorithm `do_something`
+on the row that lives in register memory.
+
+```c++
+Tensor gmem = make_tensor(make_gmem_ptr(A), make_shape(Int<8>{}, 16));
+Tensor rmem = make_fragment_like(gmem(_, 0));
+for (int j = 0; j < size<1>(gmem); ++j) {
+  copy(gmem(_, j), rmem);
+  do_something(rmem);
+}
+```
+
+This code does not need to know anything about the `Layout` of `gmem`,
+other than that it is rank-2 and that the first mode is a compile-time value.
+The following code checks both of those conditions at compile time.
+
+```c++
+CUTE_STATIC_ASSERT_V(rank(gmem) == Int<2>{});
+CUTE_STATIC_ASSERT_V(is_static<decltype(shape<0>(gmem))>{});
+```
+
+A `Tensor` encapsulates the data type, data location,
+and possibly also the shape and stride of the tensor at compile time.
+As a result, `copy` can dispatch, based on the types and Layouts of its arguments,
+to use any of various synchronous or asynchronous hardware copy instructions
+and can auto-vectorize the copy instructions in many cases as well.
+CuTe's `copy` algorithm lives in
+[`include/cute/algorithm/copy.hpp`](../../../include/cute/algorithm/copy.hpp).
+For more details on the algorithms that CuTe provides,
+please refer to [the algorithms section](./04_algorithms.md)
+of the tutorial, or the
+[CuTe overview in the GEMM tutorial](./0x_gemm_tutorial.md).
+
diff --git a/media/docs/cute/04_algorithms.md b/media/docs/cute/04_algorithms.md
new file mode 100644
index 0000000000..e35b75612d
--- /dev/null
+++ b/media/docs/cute/04_algorithms.md
@@ -0,0 +1,223 @@
+# CuTe Tensor algorithms
+
+This section summarizes the interfaces and implementations
+of common numerical algorithms performed on `Tensor`s.
+
+The implementation of these algorithms may be found in the
+[include/cute/algorithm/](../../../include/cute/algorithm/)
+directory.
+
+## `copy`
+
+CuTe's `copy` algorithm copies the elements of a source `Tensor`
+into the elements of a destination `Tensor`.
+The various overloads of `copy` can be found in
+[`include/cute/algorithm/copy.hpp`](../../../include/cute/algorithm/copy.hpp).
+
+### Interface and specialization opportunities
+
+A `Tensor` encapsulates the data type, data location,
+and possibly also the shape and stride of the tensor at compile time.
+As a result, `copy` can and does dispatch,
+based on the types of its arguments,
+to use any of various synchronous or asynchronous hardware copy instructions.
+
+The `copy` algorithm has two main overloads.
+The first just takes the source `Tensor` and the destination `Tensor`.
+
+```c++
+template <class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy(Tensor<SrcEngine, SrcLayout> const& src,
+     Tensor<DstEngine, DstLayout>      & dst);
+```
+
+The second takes those two parameters, plus a `Copy_Atom`.
+
+```c++
+template <class... CopyArgs,
+          class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy(Copy_Atom<CopyArgs...> const& copy_atom,
+     Tensor<SrcEngine, SrcLayout> const& src,
+     Tensor<DstEngine, DstLayout>      & dst);
+```
+
+The two-parameter `copy` overload picks a default implementation
+based only on the types of the two `Tensor` parameters.
+The `Copy_Atom` overload lets callers override that default
+by specifying a nondefault `copy` implementation.
+
+### Parallelism and synchronization depend on parameter types
+
+Either the default implementation or
+the implementation selected by a `Copy_Atom` overload
+may use none or all of the available parallelism,
+and may have a variety of synchronization semantics.
+The behavior depends on `copy`'s parameter types.
+Users are expected to figure this out based on their knowledge
+of the architecture on which they are running.
+(Developers often write a custom optimized kernel
+for each GPU architecture.)
+
+The `copy` algorithm may be sequential per thread,
+or it may be parallel across some collection of threads
+(e.g., a block or cluster).
+
+If `copy` is parallel,
+then the collection of participating threads
+may need synchronization before any thread in the collection
+may assume that the copy operation has completed.
+For example, if the participating threads form a thread block,
+then users must invoke `__syncthreads()`
+or the Cooperative Groups equivalent
+before they may use the results of `copy`.
+
+The `copy` algorithm may use asynchronous copy instructions,
+such as `cp.async` (or its C++ interface, `memcpy_async`).
+In that case, users will need to perform
+the additional synchronization appropriate to that underlying implementation
+before they may use the results of the `copy` algorithm.
+[The CuTe GEMM tutorial example](../../../examples/cute/tutorial/sgemm_nt_1.cu)
+shows one such synchronization method.
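+
+As a rough sketch, on SM80 and newer, one such sequence uses CuTe's
+`cp.async` helpers `cp_async_fence()` and `cp_async_wait<N>()`
+(declared in [`include/cute/arch/copy_sm80.hpp`](../../../include/cute/arch/copy_sm80.hpp));
+the tensor names below are assumed for illustration.
+
+```c++
+copy(tAgA, tAsA);    // global -> shared; may lower to cp.async instructions
+cp_async_fence();    // commit the asynchronous copies issued so far
+cp_async_wait<0>();  // wait until all committed copies have completed
+__syncthreads();     // make the shared-memory tile visible to the whole block
+```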
+More optimized GEMM implementations use pipelining techniques
+to overlap asynchronous `copy` operations with other useful work.
+
+### A generic copy implementation
+
+A simple example of a generic `copy` implementation
+for any two `Tensor`s looks like this.
+
+```c++
+template <class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy(Tensor<SrcEngine, SrcLayout> const& src,  // Any logical shape
+     Tensor<DstEngine, DstLayout>      & dst)  // Any logical shape
+{
+  for (int i = 0; i < size(src); ++i) {
+    dst(i) = src(i);
+  }
+}
+```
+
+This generic `copy` algorithm addresses both `Tensor`s
+with 1-D logical coordinates, thus traversing both `Tensor`s
+in a logical column-major order.
+Some reasonable architecture-independent optimizations
+would include the following.
+
+1. If the two `Tensor`s have known memory spaces with optimized
+   access instructions (like `cp.async`), then dispatch to the
+   custom instruction.
+
+2. If the two `Tensor`s have static layouts and it can be proven
+   that element vectorization is valid -- for example, four `LDS.32`s
+   can be combined into a single `LDS.128` -- then vectorize the source
+   and destination tensors.
+
+3. If possible, validate that the copy instruction to be used is
+   appropriate for the source and destination tensors.
+
+CuTe's optimized copy implementations can do all of these.
+
+## `copy_if`
+
+CuTe's `copy_if` algorithm lives in the same header as `copy`,
+[`include/cute/algorithm/copy.hpp`](../../../include/cute/algorithm/copy.hpp).
+The algorithm takes source and destination `Tensor` parameters like `copy`,
+but it also takes a "predication `Tensor`"
+with the same shape as the input and output.
+Elements of the source `Tensor` are only copied
+if the corresponding predication `Tensor` element is nonzero.
+
+For details on why and how to use `copy_if`,
+please refer to the
+["predication" section of the tutorial](./0y_predication.md).
+
+## `gemm`
+
+### What `gemm` computes
+
+The `gemm` algorithm takes three `Tensor`s, A, B, and C.
+What it does depends on the number of modes
+that its `Tensor` parameters have.
+We express these modes using letters.
+
+* V indicates a "vector," a mode of independent elements.
+
+* M and N indicate the number of rows resp. columns
+  of the matrix result C of the BLAS's GEMM routine.
+
+* K indicates the "reduction mode" of GEMM,
+  that is, the mode along which GEMM sums.
+  Please see the [GEMM tutorial](./0x_gemm_tutorial.md) for details.
+
+We list the modes of the input `Tensor`s A and B,
+and the output `Tensor` C,
+using a notation `(...) x (...) => (...)`.
+The two leftmost `(...)` describe A and B (in that order),
+and the `(...)` to the right of the `=>` describes C.
+
+1. `(V) x (V) => (V)`. The element-wise product of vectors: Cv += Av Bv. Dispatches to FMA or MMA.
+
+2. `(M) x (N) => (M,N)`. The outer product of vectors: Cmn += Am Bn. Dispatches to (4) with V=1.
+
+3. `(M,K) x (N,K) => (M,N)`. The product of matrices: Cmn += Amk Bnk. Dispatches to (2) for each K.
+
+4. `(V,M) x (V,N) => (V,M,N)`. The batched outer product of vectors: Cvmn += Avm Bvn. Optimizes for register reuse and dispatches to (1) for each M, N.
+
+5. `(V,M,K) x (V,N,K) => (V,M,N)`. The batched product of matrices: Cvmn += Avmk Bvnk. Dispatches to (4) for each K.
+
+Please refer to the [GEMM tutorial](./0x_gemm_tutorial.md)
+for an overview of CuTe's convention for ordering the modes.
+For example, if K appears, it always appears rightmost ("outermost").
+If V appears, it always appears leftmost ("innermost").
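+
+As a small sketch of case (3) above, with owning register-backed `Tensor`s
+(the shapes and element type here are made up for illustration):
+
+```c++
+Tensor A = make_tensor<float>(make_shape(Int<8>{}, Int<4>{}));  // (M,K) = (8,4)
+Tensor B = make_tensor<float>(make_shape(Int<8>{}, Int<4>{}));  // (N,K) = (8,4)
+Tensor C = make_tensor<float>(make_shape(Int<8>{}, Int<8>{}));  // (M,N) = (8,8)
+
+clear(C);        // zero the accumulators
+gemm(A, B, C);   // case (3): C(m,n) += A(m,k) * B(n,k) for all m, n, k
+```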
+
+### Dispatch to optimized implementations
+
+Just like with `copy`, CuTe's implementation of `gemm`
+uses its `Tensor` arguments' types to dispatch
+to an appropriately optimized implementation.
+Also like `copy`, `gemm` takes an optional `MMA_Atom` parameter
+that lets callers override the default `FMA` instruction
+that CuTe would select based on the `Tensor` arguments' types.
+
+For more information on `MMA_Atom` and on specialization of `gemm`
+for different architectures, please refer to the
+[MMA section of the tutorial](./0t_mma_atom.md).
+
+## `axpby`
+
+The `axpby` algorithm lives in the header file
+[`include/cute/algorithm/axpby.hpp`](../../../include/cute/algorithm/axpby.hpp).
+It assigns to $y$ the result of $\alpha x + \beta y$,
+where $\alpha$ and $\beta$ are scalars and $x$ and $y$ are `Tensor`s.
+The name stands for "Alpha times X Plus Beta times Y,"
+and is a generalization of the original BLAS "AXPY" routine
+("Alpha times X Plus Y").
+
+## `fill`
+
+The `fill` algorithm lives in the header file
+[`include/cute/algorithm/fill.hpp`](../../../include/cute/algorithm/fill.hpp).
+It overwrites the elements of its `Tensor` output argument
+with a given scalar value.
+
+## `clear`
+
+The `clear` algorithm lives in the header file
+[`include/cute/algorithm/clear.hpp`](../../../include/cute/algorithm/clear.hpp).
+It overwrites the elements of its `Tensor` output argument with zeros.
+
+## Other algorithms
+
+CuTe provides other algorithms.
+Their header files can be found in the
+[`include/cute/algorithm`](../../../include/cute/algorithm)
+directory.
diff --git a/media/docs/cute/0t_mma_atom.md b/media/docs/cute/0t_mma_atom.md
new file mode 100644
index 0000000000..7bdc407413
--- /dev/null
+++ b/media/docs/cute/0t_mma_atom.md
@@ -0,0 +1,434 @@
+# CuTe's support for Matrix Multiply-Accumulate instructions
+
+In this file, we explain in detail how we support our GPUs'
+Matrix Multiply-Accumulate (MMA) hardware instructions in CuTe.
+
+MMAs are architecture-specific.
+Different generations of GPU architectures
+introduce different sets of MMA instructions.
+However, CuTe features such as `Layout`
+make it possible to expose MMAs for use in generic CUDA C++ code.
+We do this in two steps.
+
+1. We wrap each MMA's PTX instruction in an "Operation" struct.
+
+2. For each Operation struct, we define a "Traits" struct
+   that defines all of the meta-information needed to use the Operation.
+
+## CuTe MMA Atoms
+
+CuTe exposes each MMA to generic CUDA C++ code as a pair of structs:
+an "Operation" struct,
+and an `MMA_Traits` struct templated on the Operation struct type.
+
+An "Operation" struct exposes the PTX instruction
+for that specific operation.
+It defines the arguments and interface it expects.
+Operation structs have minimal software dependencies --
+they do not use layouts, tensors, or non-standard numeric data types.
+Different structs have different names
+that describe what the MMA instruction does.
+We will explain the naming scheme below.
+
+A corresponding `MMA_Traits` struct specialization
+defines meta-information about the Operation,
+such as the compute types, the logical shape of the operation,
+and the `Layout`s of threads and values within the operation.
+The `MMA_Traits` struct takes the Operation as a template parameter.
+CuTe specializes `MMA_Traits` for each Operation type that it supports.
+
+Together, these two types comprise an "Atom" that decouples the complexity of thread and data layouts from the call site of the PTX instruction.
The Atom's Traits struct exposes information that is relevant to a single MMA operation, no matter the granularity at which it operates. + +CuTe MMA atoms expose the semantics of a single MMA operation. +This is true regardless of the hardware level at which the MMA operates. +CuTe supports MMA atoms that operate at a variety of hardware levels, +including + +* a single thread (e.g., fused multiply-add (FMA) instruction); + +* a quadpair (Volta); + +* a single warp (Ampere); and + +* a warpgroup (Hopper). + +### Operation structs + +#### Location of files + +CuTe provides its Operations structs in the +[`include/cute/arch`](../../../include/cute/arch) +directory, in header files starting with `mma`. + +#### Operation struct's name + +A CuTe Operation struct's name encodes information about + +* its first supported architecture, + +* the M, N, and K dimensions that it accepts, + +* the types that it takes, and + +* the expected A and B layouts. + +For example, the Volta section below will refer to the +`SM70_8x8x4_F32F16F16F32_NT` Operation struct defined in +[`include/cute/arch/mma_sm70.hpp`](../../../include/cute/arch/mma_sm70.hpp). + +* "SM70" refers to Volta. + +* "8x8x4" refers to M = 8, N = 8, and K = 4, + the dimensions of the MMA operation that the quadpair performs + (see below). + +* "F32F16F16F32" refers to the element types + of the four matrix operands A, B, C, and D. + An MMA computes D = C + A * B, + so we read the types from left to right: + D is F32 (`float`), A is F16 (half), + B is F16 (half), and C is F32 (`float`). + +* "NT" means that A is M-major (not transposed) + and B is N-major (transposed). + +#### Contents + +An Operation struct has the following members. + +##### Type aliases + +An Operation struct has four public type aliases: +`DRegisters`, `ARegisters`, `BRegisters`, and `CRegisters`. +For example, the `SM70_8x8x4_F32F16F16F32_NT` Operation struct defined in +[`include/cute/arch/mma_sm70.hpp`](../../../include/cute/arch/mma_sm70.hpp) +defines these as follows. + +```c++ +using DRegisters = float[8]; +using ARegisters = uint32_t[2]; +using BRegisters = uint32_t[2]; +using CRegisters = float[8]; +``` + +This shows how many values each thread will pass into the PTX instruction +for each of the matrices A, B, C, and D. For this Operation, +each thread passes 8 F32 values each for C and D (hence `float[8]`), +and 4 F16 values each for A and B (hence `uint32_t[2]`; +the instruction packs two 16-bit F16 values +in each of the two 32-bit `uint32_t` values). + +##### `fma` static member device function + +An operation struct defines a public `static void fma` function. +It is marked with the `CUTE_HOST_DEVICE` macro, +which adds the `__host__ __device__` annotations. +Different Operations define `fma` to take different numbers of arguments, +depending on the PTX MMA instruction. +The implementation protects use of the PTX instruction with a macro, +and raises an `assert` if `fma` is called when the macro is not defined. +This ensures that tests and examples that use this Operation in an Atom +can still compile, even if the PTX instruction is not available. + +### Traits + +#### Location of files + +CuTe provides its Traits structs in the +[`include/cute/atom`](../../../include/cute/atom) +directory, in header files starting with `mma_traits`. + +#### Contents + +An `MMA_Traits` specialization defines the following public type aliases. 
+
+* `ElementDVal`: Compute type of the D matrix
+
+* `ElementAVal`: Compute type of the A matrix
+
+* `ElementBVal`: Compute type of the B matrix
+
+* `ElementCVal`: Compute type of the C matrix
+
+* `Shape_MNK`: Logical MxNxK shape of the MMA operation
+
+* `ThrID`: Logical thread mapping within the single MMA operation
+  (specifying the quadpair, warp, or warpgroup view)
+
+* `ALayout`: Mapping of (thread,value) pairs to the logical MxK A matrix
+
+* `BLayout`: Mapping of (thread,value) pairs to the logical NxK B matrix
+
+* `CLayout`: Mapping of (thread,value) pairs to the logical MxN C matrix
+
+#### Example
+
+The specialization of MMA_Traits for the
+`SM70_8x8x4_F32F16F16F32_NT` Operation lives in the header file
+[`include/cute/atom/mma_traits_sm70.hpp`](../../../include/cute/atom/mma_traits_sm70.hpp).
+It looks like this.
+
+```c++
+template <>
+struct MMA_Traits<SM70_8x8x4_F32F16F16F32_NT>
+{
+  using ElementDVal = float;
+  using ElementAVal = half_t;
+  using ElementBVal = half_t;
+  using ElementCVal = float;
+
+  using Shape_MNK = Shape<_8,_8,_4>;
+  using ThrID     = SM70_QuadPair;
+  using ALayout   = SM70_8x4_Col;
+  using BLayout   = SM70_8x4_Col;
+  using CLayout   = SM70_8x8_32b;
+};
+```
+
+The next section will explain these type aliases in detail.
+
+## Volta
+
+This and the following sections show examples of how to construct MMA atoms.
+We don't try to explain this for all GPU architectures and MMAs.
+Instead, we use selected examples to illustrate the process
+of developing new atoms.
+
+The Volta architecture implements an HMMA instruction in which a group of 8 threads called a quadpair (QP) collaborate to share data and perform an 8x8x4 (fp32 or fp16) matrix multiply-accumulate. (Since a warp is 32 threads wide, a warp-level MMA would span 4 QPs for a tile size of 16x16x4.)
+
+We first take a look at how to encode the ISA semantics of thread and data partitioning for the HMMA instruction in a Traits struct. The HMMA NT instruction has the thread-data layout:

+*(figure: HMMA.8x8x4.NT.png)*

+
+### Types
+
+The HMMA NT above uses types:
+
+```cpp
+  using ElementDVal = float;
+  using ElementAVal = half_t;
+  using ElementBVal = half_t;
+  using ElementCVal = float;
+```
+
+The rest of the `MMA_Traits` will be described in units of these types.
+
+### Shape
+
+The HMMA NT above has shape 8x8x4:
+
+```cpp
+  // Logical shape of the MMA
+  using Shape_MNK = Shape <_8,_8,_4>;
+```
+
+### Thread ID
+
+If the 32 threads in a warp are logically indexed by [0 ... 31], then the above image contains threads [0,1,2,3]U[16,17,18,19]. These threads make up the 0th quadpair. We can write a thread mapping that maps eight logical thread ids [0,1,2,3,4,5,6,7] of the MMA to a quadpair thread index [0,1,2,3]U[16,17,18,19] of a warp. The layout function has four elements with a stride of 1, repeated twice with a stride of 16. With this, we write a layout that represents a quadpair:
+
+```cpp
+  // Mapping from (logical thread id) -> (thread idx)
+  using ThrID = Layout<Shape <_4,_2>,
+                       Stride<_1,_16>>;
+```
+
+Again, this layout function maps the logical thread id [0,8) of the MMA operation onto the quadpair thread index [0,4)U[16,20) of a warp.
+
+### Accumulator Mapping
+
+Let us look at exactly how the 8 threads within a QP are mapped to the A, B, and C matrices. For the C and D matrices, the above image is broken down a bit more below. On the left is shown the whole QP level view, and on the right is shown the values owned by just thread 0.

+*(figure: HMMA.8x8x4.quadpair.C.png)*

+
+The metainformation of this single instruction level view is what we want to encode in CuTe. Specifically, the QP level view in this diagram corresponds to the four MMA traits for [SM70_F32F16F16F32](../../../include/cute/arch/mma_sm70.hpp). These structs contain the `Element` types, the `Shape_MNK`, and the `ThrID` mapping we constructed above. Now, let us take a look at the definition of `CLayout`, the thread-data layout of accumulators. The job of `CLayout` is to construct a mapping between the `(logical_thr_id, logical_val_id)` and `(m, n)` coordinates in the C matrix, which can then be used to build up more complicated layouts and operations like the 16x16x4 WMMA.
+
+We can start constructing a `CLayout` from the picture above. As with any CuTe layout, it is a pair of `Shape` and corresponding `Stride`. Let us just look at the shape for now. We know that the HMMA uses 8 threads, each of which owns 8 values. Therefore, the shape of our mapping must have a size of 8 along two modes. With this, we have
+
+```cpp
+  // (T8,V8) -> (m,n)
+  using CLayout = Layout<Shape <_8,_8>,
+                         Stride<_?,_?>>;  // Stride to be filled in below
+```
+
+This is not to be confused with the logical 8x8 shape of the C matrix. This is 8-threads by 8-values. We now want to map those to (m,n) coordinates. Since CuTe layouts return indices rather than coordinates, we choose a column-major encoding of the (m,n) coordinates:
+
+```
+(logical_thr_id, logical_val_id) -> (m, n) == m + n * M
+```
+
+With this in place, we can start thinking about how to construct the strides in `CLayout`. Let's begin by looking at the strides between threads. Note that
+* `(T0,V0)` is located at `(m,n) = (0,0) = 0`
+* `(T1,V0)` is located at `(m,n) = (1,0) = 1`
+* `(T2,V0)` is located at `(m,n) = (0,2) = 16`
+* `(T3,V0)` is located at `(m,n) = (1,2) = 17`
+* `(T4,V0)` is located at `(m,n) = (4,0) = 4`
+* `(T5,V0)` is located at `(m,n) = (5,0) = 5`
+* `(T6,V0)` is located at `(m,n) = (4,2) = 20`
+* `(T7,V0)` is located at `(m,n) = (5,2) = 21`
+
+where `T4`,`T5`,`T6`,`T7` are the 4th,5th,6th,7th logical thread ids of the MMA, corresponding to thread indices 16,17,18,19 of the warp (recorded in the `ThrID` mapping!).
+
+We note that the pattern can be transcribed to a layout. We can find the position of the 8 threads via
+
+```cpp
+  using CLayout = Layout<Shape <Shape <_2,_2,_2>, _8>,
+                         Stride<Stride<_1,_16,_4>, _?>>;
+```
+
+With the exact same approach, we can construct the stride along the `logical value id` mode.
+* `(T0,V0)` is located at `(m,n) = (0,0) = 0`
+* `(T0,V1)` is located at `(m,n) = (0,1) = 8`
+* `(T0,V2)` is located at `(m,n) = (2,0) = 2`
+* `(T0,V3)` is located at `(m,n) = (2,1) = 10`
+* `(T0,V4)` is located at `(m,n) = (0,4) = 32`
+* `(T0,V5)` is located at `(m,n) = (0,5) = 40`
+* `(T0,V6)` is located at `(m,n) = (2,4) = 34`
+* `(T0,V7)` is located at `(m,n) = (2,5) = 42`
+
+We note that this pattern can also be transcribed to a layout. We can find the position of the 8 values via
+
+```cpp
+  // (T8,V8) -> (m,n)
+  using CLayout = Layout<Shape <Shape <_2,_2, _2>, Shape <_2,_2, _2>>,
+                         Stride<Stride<_1,_16,_4>, Stride<_8,_2,_32>>>;
+```
+
+And that's all! We can verify that each `(tid,vid)` coordinate in this layout is reliably mapped to the correct (encoded) `(m,n)` coordinate.
+
+In the case of F16 accumulators, the layout is much simpler. Each row of accumulators `(m, :)` is held by a single thread, which makes the layout:
+
+```cpp
+  using CLayout = Layout<Shape <_8,_8>,
+                         Stride<_1,_8>>;
+```
+
+### A and B Layout Mapping
+
+A and B matrix layouts depend on whether the sources are transposed or not.
The diagram below shows the thread ID to data ownership map for A and B matrices in the case of NT and TN transposes. + +

+*(figure: HMMA.8x8x4.quadpair.AB.png)*

+
+Let's look at the TN layout for the A matrix first (right side in the diagram). Again, there are the same 8 logical threads, but each thread owns only 4 elements this time. The shape of `ALayout` will then be `Shape<_8, _4>`. As for the strides, we again use a similar column-major encoding `(m, k) -> m + k * M`. Looking down the `M` mode, we go from `(T0, V0)` to `(T1, V0)`, which is a stride of 1 for all 8 threads. For the `K` mode, as we go across, we go from `(T0, V0)` to `(T0, V1)`, which makes a stride of 8 for all 4 values. Therefore, the A layout is:
+
+```cpp
+  // (T8,V4) -> (m,k)
+  using ALayout = Layout<Shape <_8,_4>,
+                         Stride<_1,_8>>;
+```
+
+The source B layout is constructed similarly for the TN HMMA, except that we want to write it as `(N,K)` rather than `(K,N)` for convenience. For the strides, as we go across the `N` mode, we go from `(T0, V0)` to `(T1, V0)`, making this a stride of 1 for all 8 threads. As we go down the `K` mode, we go from `(T0, V0)` to `(T0, V1)`, which is a stride of 8 for all 4 values. So the B layout is the same as A:
+
+```cpp
+  // (T8,V4) -> (n,k)
+  using BLayout = Layout<Shape <_8,_4>,
+                         Stride<_1,_8>>;
+```
+
+The layouts in the case of NT are a bit more complicated (left side of the diagram). Going down the `M` mode of `A`, we see the four values of `T0` first, and then we see the four values of `T4`. This means we first have a stride of 1 for 4 values, followed by a stride of 4 from `T0` to `T4`. So we have two sub-strides along the `M` mode. For the `K` mode, as we go across, we simply increment the `thr_id`, keeping `val_id` the same, making the stride 8 for 4 threads. This makes the A layout:
+
+```cpp
+  // (T8,V4) -> (m,k)
+  using ALayout = Layout<Shape <Shape <_4,_2>,_4>,
+                         Stride<Stride<_8,_4>,_1>>;
+```
+
+With the `(N,K)` ordering for B, the layout is the same.
+
+```cpp
+  // (T8,V4) -> (n,k)
+  using BLayout = Layout<Shape <Shape <_4,_2>,_4>,
+                         Stride<Stride<_8,_4>,_1>>;
+```
+
+The NN and TT transposes are simply combinations of the two layouts we have seen for A and B so far.
+
+## Hopper
+
+Now, we are ready to take a look at the much larger GMMA operation (Group MMA), first introduced with the Hopper architecture. These MMA instructions operate at the granularity of 128 threads (4 warps), which are collectively referred to as a warpgroup.
+
+### Thread ID
+
+In the case of Hopper GMMAs, the thread IDs are assigned based on a simple 1-D contiguous layout, which makes `ThrID` trivial:
+
+```cpp
+using ThrID = Layout<_128, _1>;
+```
+
+### Accumulator Mapping
+
+Accumulators are mapped hierarchically in GMMA, starting from the concept of a core matrix and building up to a layout for the whole C matrix tile. Let's look at this core matrix first. We only consider fp16 accumulators here, but the extension to fp32 accumulators is trivial, as we will see later.
+
+Each core matrix has the layout as shown in the diagram below.

+*(figure: gmma_coremat_cd_fp16.png)*

+ +As in the Volta examples, the thread IDs are logical only, and which of the four warps they belong to in the warpgroup is not important. + +Then GMMA tiles this core matrix first vertically along the M mode, and then repeats that column of core matrices along the N mode to construct the full MxN tile. This tiling is shown in the image below. + +

+*(figure: gmma_wg_n_slice.png)*

+
+With this image, we are again ready to start building the `CLayout` for the `SM90_64x128x16_F16F16F16F16_TN` atom. Same as before, we are constructing a mapping between the `(logical_thr_id, logical_val_id) -> (m, n)` coordinate spaces.
+
+To begin, let's follow the first few threads and values. We immediately see that they are arranged along the `N`-mode with pairs of values and four threads. This gives us
+
+```cpp
+// (T128,V4) -> (M64,N8)
+using CLayout = Layout<Shape <Shape <  _4, ...>, Shape < _2, ...>>,
+                       Stride<Stride<_128, ...>, Stride<_64, ...>>>;
+```
+
+To complete the first 8x8 core matrix, the four threads repeat eight times down the `M`-mode:
+
+```cpp
+// (T128,V4) -> (M64,N8)
+using CLayout = Layout<Shape <Shape <  _4,_8, ...>, Shape < _2, ...>>,
+                       Stride<Stride<_128,_1, ...>, Stride<_64, ...>>>;
+```
+
+Then, as we go to the next core matrix, we wrap back again to `T0`, but this time to `(T0, V2)`.
+
+```cpp
+// (T128,V4) -> (M64,N8)
+using CLayout = Layout<Shape <Shape <  _4,_8, ...>, Shape < _2,_2>>,
+                       Stride<Stride<_128,_1, ...>, Stride<_64,_8>>>;
+```
+
+Finally, we get this entire pattern repeating four times, once for each warp, down the `M`-mode, starting at `(m,n) = (16,0) = 16`, where two core matrices that belong to the same warp are stacked on top of each other. This makes the size of the final sub-mode along `M` equal to 4. As for the stride, this time we go from `(T0, V0)` to `(T32, V0)`, which makes it a stride of 16.
+
+```cpp
+// (T128,V4) -> (M64,N8)
+using CLayout = Layout<Shape <Shape <  _4,_8, _4>, Shape < _2,_2>>,
+                       Stride<Stride<_128,_1,_16>, Stride<_64,_8>>>;
+```
+
+This is the full `CLayout` for 64x8 accumulators. The GMMA instructions include 64xN variants with `N = [16,32,64,128,256]` where this 64x8 pattern is repeated, giving each thread additional values. As this repetition starts at `(m,n) = (0,8) = 512`, it is easy to account for in our `CLayout`. For example, the 64x128 `CLayout` is
+
+```cpp
+// (T128,V64) -> (M64,N128)
+using CLayout = Layout<Shape <Shape <  _4,_8, _4>, Shape < _2,_2, _16>>,
+                       Stride<Stride<_128,_1,_16>, Stride<_64,_8,_512>>>;
+```
+
+where we see 16 copies of the 64x8 tile.
+
+### A and B Layout Mapping
+
+GMMA atoms that consume A and B sources directly from shared memory are a bit interesting. The GMMA Descriptor is constructed on an entire tile of A and/or B data in shared memory rather than being partitioned by threads. That is, every thread sees the entire tile of data, and the tile is not reordered, so that the descriptor can be constructed on it. In `ALayout` form, this can be expressed
+
+```cpp
+// (T128,V64x16) -> (M64,K16)
+using ALayout = Layout<Shape <_128, Shape <_64,_16>>,
+                       Stride<  _0, Stride< _1,_64>>>;
+```
+
+That is, all threads are mapped to the `(m,k) = (0,0) = 0` element, and the values (and the shape of the values) remain unchanged. The GMMA Descriptor Constructor can then inspect the `(M,K)` layout of this data and create an appropriate GMMA Descriptor, or produce an error message saying the data is in an invalid layout for GMMA.
diff --git a/media/docs/cute/0x_gemm_tutorial.md b/media/docs/cute/0x_gemm_tutorial.md
new file mode 100644
index 0000000000..102010bb6b
--- /dev/null
+++ b/media/docs/cute/0x_gemm_tutorial.md
@@ -0,0 +1,668 @@
+# CuTe dense matrix-matrix multiply tutorial
+
+This section uses CuTe's functionality to write
+a dense matrix-matrix multiply implementation.
+
+## A simple dense matrix-matrix multiply example
+
+In this section, we will go through
+[this example](../../../examples/cute/tutorial/sgemm_nt_1.cu).
+It illustrates a blocked GPU implementation of GEMM
+that uses the building blocks of CuTe
+to construct global and shared memory layout mappings
+and partition threads among them.
+This example is closest to the blocked GEMM
+that a computer science student might be asked to implement
+in a first-year graduate school
+or upper-division undergraduate scientific computing course.
+
+Readers who understand this section may also wish to study
+CUTLASS's implementation of the stream-K GEMM algorithm,
+which uses many features of CuTe.
+
+### Filename and high-level interface
+
+First, let's look at the example's filename `sgemm_nt_1.cu`.
+"SGEMM" is the BLAS (Basic Linear Algebra Subroutines) abbreviation
+for "Single-precision real, GEneral, Matrix-matrix Multiply."
+(If we want to refer to matrix-matrix multiply for all data types,
+we say "GEMM.")
+The BLAS project started in the 1970s.
+You can learn more about its history in Turing Award winner Jack Dongarra's
+2004 Oral History interview by SIAM
+(the Society for Industrial and Applied Mathematics),
+and also in the C++ Standard document [P1417](https://wg21.link/p1417).
+The abbreviation SGEMM unpacks as follows.
+
+* "Single-precision" is Fortran-speak for float.
+  The BLAS supports four different matrix or vector element types:
+
+  * S for single precision (`float`),
+
+  * D for double precision (`double`),
+
+  * C for complex float (like C++'s `std::complex<float>`,
+    where each of the real and imaginary components has type `float`),
+    and
+
+  * Z for complex double (like C++'s `std::complex<double>`).
+
+* "GEneral" means that the matrix is represented
+  as a two-dimensional dense array
+  and not assumed to have any kind of symmetry.
+  The BLAS supports a variety of matrix representations,
+  including
+
+  * SY: SYmmetric,
+
+  * HE: HErmitian,
+
+  * TR: TRiangular,
+
+  * GB: General Banded,
+
+  * SB: Symmetric Banded,
+
+  * SP: Symmetric Packed, and
+
+  * TP: Triangular Packed.
+
+* MM means "Matrix-matrix multiply," as opposed to other operations,
+  like MV (Matrix-Vector multiply).
+
+The string "nt" in the filename means that
+the first input matrix A is "Not transposed,"
+while the second input matrix B is "Transposed."
+That is, the function computes `C := beta * C + alpha * A * B^T`,
+where the superscript T denotes the transpose of the matrix.
+(We never change the input matrix in place or
+store its entire transpose explicitly.
+Instead, we reinterpret its data in place.)
+
+GEMM's TRANSA and TRANSB arguments let users specify
+the transpose or Hermitian transpose (if complex)
+of either or both input matrices A or B.
+It turns out that implementations favor this "NT" case,
+along with "TN" (A is Transposed, B is Not transposed).
+We will explain why below.
+
+As described, the original BLAS GEMM specifies
+the dimensions of its matrices
+as A is M x K, B is K x N, and C is M x N.
+Out of convenience, CuTe interprets A
+as M x K, B as N x K, and C as M x N. Instead of row-major or column-major (or Transposed
+and Not-Transposed like above), we like to be more specific with M-major, N-major, or K-major.
+Regardless, we'll still use the BLAS "NT" notation for high-level descriptions
+of kernels when it's appropriate.
+
+Now, let's look at the code.
+We'll start with the kernel entry point `gemm_device`
+at the top of the file.
+
+```c++
+template <class MShape, class NShape, class KShape,
+          class TA, class AStride, class ABlockLayout, class AThreadLayout,
+          class TB, class BStride, class BBlockLayout, class BThreadLayout,
+          class TC, class CStride, class CBlockLayout, class CThreadLayout,
+          class Alpha, class Beta>
+__global__ static
+__launch_bounds__(decltype(size(CThreadLayout{}))::value)
+void
+gemm_device(MShape M, NShape N, KShape K,
+            TA const* A, AStride dA, ABlockLayout blockA, AThreadLayout tA,
+            TB const* B, BStride dB, BBlockLayout blockB, BThreadLayout tB,
+            TC      * C, CStride dC, CBlockLayout       , CThreadLayout tC,
+            Alpha alpha, Beta beta);
+```
+
+There are many template parameters;
+we'll explain them all in due time.
+
+`TA`, `TB`, and `TC` are the element types
+of the matrices `A`, `B`, and `C`, respectively.
+The two scalar constants `alpha` and `beta`
+are part of what GEMM computes: `C = beta * C + alpha * A * B`.
+Unlike the (traditional Fortran and C) BLAS,
+CuTe lets you mix different matrix element types and/or scalar types.
+The compiler will help, but it's somewhat up to you
+to use types that are safe and efficient on the GPU.
+For example, a custom arbitrary-precision real type
+that does dynamic allocation inside may not work on the GPU at all.
+Even if it does, it may not perform well.
+
+This leaves five kinds of things to explain:
+
+1. Shapes
+
+2. Strides
+
+3. Block layouts
+
+4. Thread layouts
+
+5. Launch bounds
+
+### Shapes
+
+The original Fortran BLAS GEMM lists the matrices' dimensions
+in the order M, N, K. CuTe also uses this convention.
+The `MShape` is just M,
+the `NShape` is just N,
+and the `KShape` is just K.
+In this example, they are dynamic (run-time) values
+defined at the top of the `gemm` host function
+that invokes the device kernel.
+
+```c++
+// Define shapes (dynamic)
+auto M = int(m);
+auto N = int(n);
+auto K = int(k);
+```
+
+Note that the function takes M, N, and K.
+It doesn't take the shapes of the three matrices separately,
+as (say) three different `Shape` objects.
+This is because matrix-matrix multiply constrains the shapes.
+
+There's nothing mysterious about `int` here;
+it's the usual C++ built-in integral type.
+`auto M = int(m)` is a way to say
+"convert `m` to an `int` if it's not already an `int`,
+and assign it to the freshly declared variable `M`."
+CuTe also has a capitalized `Int` templated type
+for representing values as compile-time constants.
+For example, `Int<5>` represents a compile-time `int` value 5.
+(CuTe implements these as subclasses
+of the C++ Standard Library class `std::integral_constant`.)
+The above `gemm_device` function is templated on the types
+of M, N, and K; this shows that CuTe can represent dimensions
+as either run-time or compile-time values.
+
+If you're familiar with the mdspan class going into C++23,
+you might notice that CuTe represents shapes
+a bit differently from mdspan.
+mdspan uses `extents<size_t, Extents...>`
+to represent a shape.
+The `Extents` are zero or more compile-time values
+(see below) representing the dimensions in the shape.
+The `Extents...` are "non-type template parameters" (NTTPs) --
+that is, they are not types, but compile-time values of type `size_t`.
+If you use the special reserved `size_t` value `std::dynamic_extent`
+as an extent value,
+the resulting dimension is a run-time value
+and is stored in the `extents` instance.
+Any other extent value is a compile-time value
+that is encoded in the extents type itself.
+In contrast, CuTe represents a shape as `Shape<Types...>`.
+The `Types...` are actual types, not NTTPs.
+A built-in integral type like `int` or `uint64_t`
+denotes a run-time dimension that is stored in the `Shape` instance,
+while a compile-time value like `Int<5>`
+encodes a compile-time dimension.
+For example, the CuTe equivalent of
+`extents<size_t, 3, dynamic_extent, 5>`
+is `Shape<Int<3>, int, Int<5>>`.
+
+#### Compile-time-ness of values
+
+C++ values have three levels of "compile-time-ness":
+
+1. dynamic (run-time) values,
+
+2. constexpr values, and
+
+3. static (compile-time) values.
+
+(Rather than saying "C++ has,"
+it's more accurate to say "C++17 has."
+C++20 introduces `consteval` or "immediate" functions,
+which make attempting to evaluate the function at run time
+(any call not in an unevaluated context) a compiler error.
+We'll ignore those for this tutorial,
+since CuTe only requires C++17.)
+
+The `constexpr` keyword was introduced in C++11.
+It means something like
+"the compiler can evaluate this expression at compile time."
+It does NOT mean "the compiler MUST evaluate this at compile time."
+If you use a `constexpr` expression in a `static_assert`
+or as a non-type template argument,
+then the compiler must evaluate the expression at compile time.
+However, for `constexpr` occurring in other places,
+the compiler may choose to store the value in registers or memory,
+and/or do computations with the value at run time.
+In some cases, the compiler must do that.
+The following example shows that the compiler
+might need to store `constexpr` values in memory sometimes.
+
+```c++
+// Some function defined in a different compilation unit.
+extern int foo(int const* x);
+
+int bar()
+{
+  constexpr int value = 42; // a compile-time constant
+
+  // Even constexpr variables have a sizeof,
+  // because we still might need to take their address.
+  static_assert(sizeof(value) == 4);
+
+  // Compiler can't inspect foo to see how it uses the value,
+  // so it has to store the value in some memory location
+  // so that we can pass its address to the function.
+  return foo(&value);
+}
+```
+
+"Static" is an unfortunately overloaded term in C++.
+Sometimes it means "the opposite of instance,"
+like a "static function" or "static member" of a class.
+(Some programming languages, like Java,
+say "class method" to refer to a "static function of a class.")
+That's not what we mean here.
+Instead, we mean "part of a compile-time type."
+For example, `Int<1>` encodes the value 1 at compile time,
+as part of the type of a templated class `Int`.
+`Int<3>` and `Int<4>` have different types.
+You can get the value of the type like this: `Int<3>::value`.
+(The `value` is a `static constexpr` member of the class,
+where "static" means "opposite of instance.")
+As soon as you go from `Int<3>` to `Int<3>::value`,
+you've gone from (3) above (a compile-time value)
+to (2) above (a `constexpr` value).
+In some situations, this may mean that the compiler
+treats it as a run-time value.
+
+#### Strides
+
+We define a layout using both shapes and strides.
+The shape just tells you the dimensions (modes, etc.) of the array.
+The strides tell you the mapping from a multidimensional index
+into a one-dimensional offset.
+Here, we're describing the shapes and strides
+of the "global" matrices A, B, and C.
+The example defines the global matrices' strides
+near the top of the `gemm` function.
+
+```c++
+// Define strides (mixed)
+auto dA = make_stride(Int<1>{}, ldA); // (dM,dK)
+auto dB = make_stride(Int<1>{}, ldB); // (dN,dK)
+auto dC = make_stride(Int<1>{}, ldC); // (dM,dN)
+```
+
+To evaluate this mapping for a given multidimensional index, take the dot product of the indices with the strides. For example, the offset of `A(index_m, index_k)` is `index_m * 1 + index_k * ldA`. Note the implications for the compile-time-ness of the offset.
Any run-time value among the indices or the strides makes the offset a run-time value. Of course, if a particular stride is a compile-time constant (especially 1), it's easier for the compiler to optimize the arithmetic and result.
+
+Note that in the original source code,
+this example is missing the comments after each line.
+We've added them in here,
+as they prompt a brief digression about shapes and modes.
+The comment after B says (dN, dK), not (dK, dN).
+This means that B is treated as an N x K matrix
+instead of a K x N matrix.
+As mentioned, CuTe follows the convention
+that the meaning of matrix modes is
+(M,K) for A, (N,K) for B, and (M,N) for C.
+In particular, CuTe's convention is that
+"the reduction mode is outermost."
+The "reduction mode" of `Shape<M, N, K>` is K.
+That's the mode over which we do a reduction,
+that is, sum up products of matrix entries.
+The K mode disappears in the output C.
+"Outermost" here means "rightmost"
+(literally, appearing rightmost in the list M, N, K).
+Note that the shapes form a kind of Einstein tensor notation.
+GEMM does `Shape<M, N> = Shape<M, K> * Shape<N, K>`.
+In Einstein notation, the repeated index indicates
+a sum of that term over all values of K.
+
+We say in general that the leftmost mode is the "inner(most)" mode,
+and the rightmost mode is the "outer(most)" mode.
+This is because,
+along with CuTe's convention of thinking of arrays as logically column major,
+the leftmost mode is most commonly the mode with the most spatial locality.
+It's very often the "most contiguous" mode.
+For this reason, it's "the mode that we want in the innermost loop"
+(in the nesting of loops that implements GEMM).
+This is why we call it the "innermost" mode.
+Its contiguity means that we also call the innermost mode the "vector mode."
+
+The vector mode also has special meaning:
+it contains all of the information needed
+to execute the smallest possible computation or communication operations
+on hardware, that is, what CuTe calls the "atoms."
+
+Modes are like units conceptually.
+For example, you shouldn't mix M-mode indices with K-mode indices.
+However, CuTe does nothing to enforce this.
+(For example, CuTe does not require use of "tagged" index types.
+Indexing works with the usual integer types.)
+
+The previous paragraph relates to shapes, not strides.
+Returning to the strides, the above code describes these strides as "mixed."
+This means that they include both run-time and compile-time values.
+For example, the stride between A(m, k) and A(m+1, k) is `Int<1>`,
+a compile-time value 1. The stride between A(m, k) and A(m, k+1),
+however, is `ldA`, the "leading dimension of A," a run-time value.
+The "leading dimension" of a matrix
+refers to the stride between consecutive columns of a column-major matrix
+(where the stride between consecutive rows is 1),
+or the stride between consecutive rows of a row-major matrix
+(where the stride between consecutive columns is 1).
+This is a naming convention from the BLAS
+and libraries that use it, like LAPACK.
+For the purpose of this tutorial, it's just a naming convention
+for "the stride that isn't the compile-time constant 1."
+
+#### M-major, N-major, K-major
+
+Note that we haven't uttered the phrases "column-major" or "row-major" here. This is where the experience of a BLAS user diverges from the experience of a BLAS implementer. BLAS users speak of "column-major" and "row-major" layouts. C++23's `mdspan` class encodes these as `layout_left` resp. `layout_right`.
However, we don't speak of "column-major" or "row-major" in our GEMM implementations.
+
+We say that a matrix is "M-major" if it is stride 1 in the M-mode, "N-major" if it is stride 1 in the N-mode, or "K-major" if it is stride 1 in the K-mode. In the above code, A has shape (M, K) and strides (1, ldA). Since A has stride 1 in the M mode, we say that A is "M major." B has shape (N, K) and strides (1, ldB), so B is "N-major." Similarly, C has shape (M, N) and strides (1, ldC), so C is "M major."
+
+How do we translate this into the BLAS user's experience?
+The following table illustrates for C and A.
+(Throughout the table, "Impl" stands for "implementation.")
+
+Note that the implementation reverses the order of B's modes,
+and flips B's strides.
+Recall that one evaluates a layout
+by taking the dot product of the indices and strides.
+Thus, reversing the order of both the modes and the strides
+does not change this evaluation.
+
+| Matrix | User's shape | User's layout | User's strides | Impl layout | Impl shape | Impl strides |
+| --- | --- | --- | --- | --- | --- | --- |
+| C | M x N | Column major | (1, LDC) | M-major | (M, N) | (1, LDC) |
+| A | M x K | Column major | (1, LDA) | M-major | (M, K) | (1, LDA) |
+
+What about the matrix B? We explained above that B is N-major. How would that translate back into the BLAS user's experience? We take a hint here from the filename including "nt." The "nt" part of the name means that A is not transposed, while B is transposed. The BLAS convention (see e.g., [the documentation for DGEMM](https://netlib.org/lapack/explore-html/d1/d54/group__double__blas__level3_gaeda3cbd99c8fb834a60a6412878226e1.html)) is that if you take the transpose, then the dimensions refer to the transpose ("with op( A ) an m by k matrix, op( B ) a k by n matrix and C an m by n matrix"). Thus, this example actually computes `C = beta * C + alpha * A * B^T`, where `B^T` is a K x N matrix with strides (LDB, 1). The user's "original" matrix B is thus N x K, with strides (1, LDB) -- that's a column-major layout. (Reversing the modes and the strides preserves the layout, since evaluating the layout mapping just takes the dot product of indices and strides.) This lets us expand the above table to include B.
+
+| Matrix | Transposed? | User's shape | User's layout | User's strides | Impl layout | Impl shape | Impl strides |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| C | No | M x N | Column major | (1, LDC) | M-major | (M, N) | (1, LDC) |
+| A | No | M x K | Column major | (1, LDA) | M-major | (M, K) | (1, LDA) |
+| B | Yes | N x K | Column major | (1, LDB) | N-major | (N, K) | (1, LDB) |
+
+CuTe developers say: "In CuTe, you can't tell transposed
+apart from non-transposed, MN-major from K-major, etc.
+without inspecting the strides."
+It's now a bit more clear what that means.
+CuTe doesn't see whether A or B are transposed.
+Instead, CuTe sees shapes and strides.
+A CuTe developer must reason backwards from the shapes and strides
+in order to see what the BLAS user sees.
+
+Why does CuTe do this? Consider that matrix multiply performs a reduction in the K-mode. From the user's perspective, it's reducing across rows of the first input matrix, but across columns of the second input matrix. If we instead mentally flip the modes of the first input matrix, then the implementation reduces over columns (the K mode) of both input matrices. This leads to two cases in which the implementation can effectively treat both input matrices in the same way.
(If you call it with A and B reversed, it should even give the same results for these cases.) + +| Case | User asks for A | User asks for B | Abbreviation | +| --- | --- | --- | --- | +| A is M major, B is N major | Not transposed | Transposed | NT | +| A and B are both K major | Transposed | Not transposed | TN | + +This is why an introductory example starts with NT or TN. +For a summary of the four different transpose options for A and B, +and their corresponding implementation layouts, +please see the table below. + +| Transpose abbreviation | User sees A transposed? | User sees B transposed? | A's impl layout | B's impl layout | +| --- | --- | --- | --- | --- | +| NT | No | Yes | M major | N major | +| TN | Yes | No | K major | K major | +| NN | No | No | M major | K major | +| TT | Yes | Yes | K major | N major | + +#### MN-major and K-major + +As we mentioned above, there are two "preferred arrangements," TN and NT. In the TN arrangement, both A and B are K-major. In the NT arrangement, A is M-major and B is N-major. Even though the two stride-1 modes in NT have different names, it's still the leftmost mode for both A and B that has stride 1. Thus, we can think of the NT arrangement as "MN-major," analogous to how the TN arrangement is "K-major." + +The two preferred arrangements tend to work themselves into implementations, particularly when they use hardware instructions for accelerating matrix multiplies of blocks. In some cases, the hardware instruction may require NT (MN-major) or TN (K-major). For NN or TT, such instructions would require an intermediate transpose -- for example, when loading from global memory to shared memory. + +### Block layouts + +Efficient matrix multiply implementations loop over blocks. +For example, a typical GPU implementation strategy +is for each thread block to iterate over some number of blocks. +In the example, this loop occurs near the end of `gemm_device`. + +```c++ +// TUTORIAL: Example of a very simple compute loop +// Data is read from global to shared memory via the tA|tB partitioning +// gemm(.) operates on the shared memory directly via the tC partitioning + +auto k_max = size<2>(tAgA); + +for (int k = 0; k < k_max; ++k) +{ + // Copy A and B blocks from global memory to shared memory. + copy(tAgA(_,_,k), tAsA); + copy(tBgB(_,_,k), tBsB); + + // On some architectures, copy may be asynchronous. + // This may call for extra synchronization instructions + // beyond just __syncthreads(). + + __syncthreads(); + + // Compute gemm on shared memory input and register accumulator. + // The "epilogue" after this loop will copy the accumulator + // from the register file into global memory. + gemm(tCsA, tCsB, tCrC); + + __syncthreads(); +} +``` + +We will explain the notation in this loop below. The important things to remember are that the coordinate `k` loops over the blocks which the calling thread is supposed to compute, the `copy` functions copy A resp. B blocks from global memory (the first argument) to shared memory (the second argument -- same as C++'s `std::copy`, but the opposite of `memcpy`), and the `gemm` function computes C += A * B on the shared memory blocks. + +It turns out that copy takes an optional first argument, the "atom," as in the following. + +```c++ +copy(atom, source, destination); +``` + +The "atom" is metadata that explains how to do the copy operation. + +There are a few topics to push onto the stack. + +The copy function call shows a notation for taking slices of a tensor. 
+Fully understanding what `copy` and `gemm` do calls for learning about thread layouts as well, so we will wait to explain them completely. For now, note that these functions are implicitly parallel, as they are called collectively by all threads in a thread block.
+
+The block dimensions are defined near the top of the host function `gemm`.
+
+```c++
+// Define block sizes (static)
+auto bM = Int<128>{};
+auto bN = Int<128>{};
+auto bK = Int< 8>{};
+```
+
+We see that these are fully compile-time dimensions. This is often the case, especially when we use hardware instructions that only work for certain problem dimensions. Three lines of code immediately below these construct the block layouts.
+
+```c++
+// Define the block layouts (static)
+auto sA = make_layout(make_shape(bM,bK));
+auto sB = make_layout(make_shape(bN,bK));
+auto sC = make_layout(make_shape(bM,bN));
+```
+
+Here, the block layouts just come from the block dimensions. A Layout has two things: a Shape, and Strides. If the caller does not provide Strides, then CuTe computes Strides corresponding to the default "column-major" arrangement of data. This just happens to match the global matrices' layouts, but in general doesn't have to. For example, in the NN or TT cases, we may want to transpose one of the input matrices when copying from global memory to shared memory.
+
+The example "comments out" some code that prints all the layouts on "thread 0" of each thread block. If you enable the printing code and run the example, it will print all the layouts. For example, sA prints as
+
+```
+sA
+(_128,_8)
+(_1,_128)
+```
+
+and sB prints as
+
+```
+sB
+(_128,_8)
+(_1,_128)
+```
+
+consistent with the definitions above.
+
+If you have looked at other GEMM examples in CuTe, you might be wondering about hardware matrix-matrix multiply instructions. Those instructions tend to require certain values for shapes and strides, which may be a function of the matrix's element type. CuTe knows about these instructions and their required shapes and strides. We will go into more detail about that elsewhere.
+
+The `gemm_device` top-level kernel uses these block layouts to allocate shared memory buffers for A and B tiles.
+
+```c++
+// Shared memory buffers
+__shared__ TA smemA[cosize_v<ABlockLayout>];
+__shared__ TB smemB[cosize_v<BBlockLayout>];
+```
+
+Note how the shared memory buffers' sizes depend only on the A resp. B layouts (and element sizes). What's `cosize_v`? The "`_v`" suffix is a C++ naming convention for a variable template that maps one or more template arguments to a value; in this case, the value is a number of elements. A layout is a function from a set of multidimensional coordinates to a set of one-dimensional array offsets. It's a function, so we can speak of its domain and codomain. The "cosize" of a layout is the size of its codomain. (See e.g., CuTe's implementation of `Layout`.) If we want to allocate a linear array, for which all the offsets produced by a layout are valid, then we can use the cosize of the layout as the length of the array (in terms of number of elements, not in terms of number of bytes).
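+
+As a quick check (a sketch, not code from the example): the layout
+(_128,_8):(_1,_128) sends its largest coordinate (127,7) to
+127 * 1 + 7 * 128 = 1023, so its cosize is 1024, which here equals its size of
+128 * 8 elements because the layout is dense.
+
+```c++
+auto sA = make_layout(make_shape(Int<128>{}, Int<8>{}));
+static_assert(cosize_v<decltype(sA)> == 1024, "smemA must hold 1024 elements");
+```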
+
+### Thread layouts
+
+CuTe uses a `Layout` to describe the assignment of threads to work items.
+In this example, the host function `gemm` constructs the thread layouts
+for A, B, and C.
+
+```c++
+// Define the thread layouts (static)
+auto tA = make_layout(make_shape(Int<32>{}, Int< 8>{}));
+auto tB = make_layout(make_shape(Int<32>{}, Int< 8>{}));
+auto tC = make_layout(make_shape(Int<16>{}, Int<16>{}));
+```
+
+That is, the thread layout for the A read is M-major 32x8, for the B read is N-major 32x8, and for the C compute/write is M-major 16x16. These thread layouts will partition the data for their respective stages.
+
+#### The example uses compile-time thread and block layouts
+
+Note that the device function `gemm_device` insists that all the thread and block layouts are static -- that is, known at compile time. You can see this from the `CUTE_STATIC_ASSERT` statements near the top of `gemm_device`. `CUTE_STATIC_ASSERT` is a wrapper for `static_assert`, which fails at compile time if its condition is `false`.
+
+```c++
+// Preconditions
+CUTE_STATIC_ASSERT(is_static<ABlockLayout>::value);
+CUTE_STATIC_ASSERT(is_static<BBlockLayout>::value);
+CUTE_STATIC_ASSERT(is_static<CBlockLayout>::value);
+
+CUTE_STATIC_ASSERT(is_static<AThreadLayout>::value);
+CUTE_STATIC_ASSERT(is_static<BThreadLayout>::value);
+CUTE_STATIC_ASSERT(is_static<CThreadLayout>::value);
+```
+
+Use of static layouts has two advantages. First, it makes it easier to prove correctness of the algorithm. If the code compiles, it's likely correct. (On the other hand, new CuTe users may find themselves doing more debugging at compile time than they have before.) Second, it makes it easier and faster for CuTe to dispatch to the correct optimized implementations (called "atoms" -- see below) for copying blocks and performing matrix multiplies.
+
+#### The example's block gemm is parallel over elements of C
+
+In the actual device function, `tC` has layout `CThreadLayout`. You might recall that the kernel function `gemm_device` uses `CThreadLayout` to derive the launch bounds, specifically the maximum number of threads per block. The launch bounds show up in the declaration of `gemm_device`.
+
+```c++
+template <class MShape, class NShape, class KShape,
+          class TA, class AStride, class ABlockLayout, class AThreadLayout,
+          class TB, class BStride, class BBlockLayout, class BThreadLayout,
+          class TC, class CStride, class CBlockLayout, class CThreadLayout,
+          class Alpha, class Beta>
+__global__ static
+__launch_bounds__(decltype(size(CThreadLayout{}))::value)
+void
+gemm_device(MShape M, NShape N, KShape K,
+            TA const* A, AStride dA, ABlockLayout blockA, AThreadLayout tA,
+            TB const* B, BStride dB, BBlockLayout blockB, BThreadLayout tB,
+            TC      * C, CStride dC, CBlockLayout       , CThreadLayout tC,
+            Alpha alpha, Beta beta);
+```
+
+The "size" of `CThreadLayout` is the total number of threads, 16 * 16 = 256. (We take `::value` because the size is actually `Int<256>`, a compile-time constant with a `static constexpr int value = 256` member.) This suggests that the block gemm function (in the loop over blocks) parallelizes over elements of the C block. We can see this as well from the kernel launch (at the end of the `gemm` host function), which uses the size of `CThreadLayout` as the block dimension.
+ +```c++ +// Define the thread layouts (static) +auto tA = make_layout(make_shape(Int<32>{}, Int< 8>{})); +auto tB = make_layout(make_shape(Int<32>{}, Int< 8>{})); +auto tC = make_layout(make_shape(Int<16>{}, Int<16>{})); + +dim3 dimBlock(size(tC)); +dim3 dimGrid(ceil_div(size(M), size(bM)), + ceil_div(size(N), size(bN))); +gemm_device + <<< dimGrid, dimBlock, 0, stream >>> + (M, N, K, + A, dA, sA, tA, + B, dB, sB, tB, + C, dC, sC, tC, + alpha, beta); +``` + +Note that dimBlock is single-dimensional (despite being a dim3), as the size of a layout is a single value. We can see this also because the example only ever uses `threadIdx.x`, not `threadIdx.y`. Yet, C's thread layout has shape (16, 16). What's with that? Recall that a thread layout maps from a "logical" coordinate space (possibly multidimensional tuples of indices) to (one-dimensional) integer indices. In this case, `CThreadLayout` maps from pairs of indices in the Cartesian product space {0, 1, 2, ..., 15} x {0, 1, 2, ..., 15}, to one-dimensional indices 0, 1, 2, ..., 255. The latter, the output of `CThreadLayout`, is the actual thread index `threadIdx.x` in this case. `CThreadLayout` has only a shape (16, 16) and no nondefault strides, so it uses CuTe's default column-major arrangement (with strides (1, 16) in this case). + +#### What does `local_tile` do? + +The following code near the top of `gemm_device` +operates on the "global" (input and output) matrices A, B, and C +(where mA, mB, and mC are their Tensor representations). + +```c++ +// Get the appropriate blocks for this thread block -- +// potential for thread block locality +auto blk_shape = make_shape(size<0>(sA), size<0>(sB), size<1>(sB)); // (BLK_M,BLK_N,BLK_K) +auto blk_coord = make_coord(blockIdx.x, blockIdx.y, _); // (m,n,k) + +Tensor gA = local_tile(mA, blk_shape, blk_coord, Step<_1, X,_1>{}); // (BLK_M,BLK_K,k) +Tensor gB = local_tile(mB, blk_shape, blk_coord, Step< X,_1,_1>{}); // (BLK_N,BLK_K,k) +Tensor gC = local_tile(mC, blk_shape, blk_coord, Step<_1,_1, X>{}); // (BLK_M,BLK_N) +``` + +There are two new features here: + +* `make_coord`, which returns a `Coord`, a multidimensional index which can be used as the input of a `Layout`; and + +* `local_tile`, which we will explain below. + +The `Coord`(inate) `blk_coord` refers to the set of blocks (indexed by k -- the underscore here indicating a free parameter) our thread block will access. (The index k here doesn't mean the K mode; it's the same index as in the loop over blocks that does the computation.) + +If we print out the `gA`, `gB`, and `gC` layouts, we get the following. + +``` +gA +(_128,_8,512) +(_1,5120,40960) + +gB +(_128,_8,512) +(_1,5120,40960) + +gC +(_128,_128) +(_1,5120) +``` + +All of these layouts come from the original input or output matrices A, B, and C. Thus, they preserve the original strides, which are all the same in this example (when using default problem dimensions), 5120. This is most easily seen in the gC layout. For the other layouts, there is a clue in 5120 * 8 = 40960. That is, every time we increase k by one, we "skip over 8 columns" of the global matrix, over to the next block of data. This illustrates an important feature of CuTe, that it can view the same data with different modes and/or strides, as a way to identify parallelism or locality. + +## Next steps + +The above "simple GEMM" example's performance on many problems +is asymptotically optimal +with respect to the GPU's floating-point throughput. 
+
+Getting nearly peak performance
+relative to the GPU's floating-point throughput,
+for a wider variety of problem dimensions,
+calls for more advanced techniques.
+Please refer to other examples in this repository
+to learn more about those techniques.
+For example, the
+[predication section of the tutorial](./0y_predication.md)
+explains what to do if a matrix tiling
+doesn't perfectly divide the matrix.
+
+### Implement GEMM as generalized tensor contraction (GETT)
+
+"GETT" here stands for "general(ized) tensor times tensor,"
+a tensor contraction.
+
+CuTe permits matrices to have nested `Layout`s.
+For example, a matrix A can have a nested `Layout` for its M and K modes.
+This means that we can use a "matrix" (`Tensor` with two modes)
+to represent any `Tensor`.
+This amounts to a "native hierarchical representation."
+
+As a result, we can implement GETT by using
+our existing GEMM implementation layers,
+with a little bit of fancy custom predication for the K mode.
+This is because the stride type of A
+and the problem shape itself
+are CuTe Shapes and Strides.
+This lets us represent the hierarchical modes
+of a tensor contraction problem
+(which still fundamentally only has 4 modes --
+a batch mode,
+two outer modes (one for A and one for B),
+and one reduction mode --
+each of which can now have as many nested modes as you want
+for the contraction's inputs).
+We thus implement GETT as a contraction in just one mode -- the K mode.
+However, K itself can be hierarchical and can have noncontiguous strides.
+We can reorder the modes such that all contraction modes
+become a single, possibly hierarchical K mode in the kernel.
+This is how we would encode a contraction in multiple modes at once.
diff --git a/media/docs/cute/0y_predication.md b/media/docs/cute/0y_predication.md
new file mode 100644
index 0000000000..f764508bf1
--- /dev/null
+++ b/media/docs/cute/0y_predication.md
@@ -0,0 +1,217 @@
+# Predication: What to do when tiling isn't perfect
+
+The [GEMM tutorial](./0x_gemm_tutorial.md) shows how
+we compute a matrix-matrix multiply
+by iterating over tiles of the input matrices and output matrix.
+The examples all assume that the tiles fit evenly into the matrices,
+with no remainder.
+What do we do if this is not the case?
+For example, we might want to tile a 41 x 55 matrix into 4 x 8 tiles,
+but 41 / 4 is 10 remainder 1, and 55 / 8 is 6 remainder 7.
+What do we do with those "leftover" parts of the matrix?
+
+Another way to say this is that `logical_divide`
+(CuTe's way of tiling layouts) "rounds up."
+For example, if `N` is the layout (1000, 1) and `B` is the layout (128, 1),
+then `logical_divide(N, B)` is the layout ((128, 8), (1, 128)).
+This effectively rounds up the original shape N = 1000
+into a 128 x 8 matrix (as if N = 1024).
+What about those last 24 elements,
+that aren't part of the original data?
+
+The idiomatic CuTe way to solve this problem is through "predication."
+Rather than trying to reason about the "remainder tiles,"
+CuTe instead rounds up, but only tries to access data in each tile
+that are part of the matrix.
+This corresponds well with how our GPUs optimize:
+branches without warp divergence are relatively fast.
+It also matches the usual CUDA idiom
+when dividing N work items in 1-D fashion over B thread blocks:
+first test if "my thread" is out of bounds before doing work.
+
+There are a few ways to figure out
+which elements need to be predicated.
+In-kernel GEMMs like to do this in the following way.
+
+```c++
+// Create the predicate tensor
+Layout idA  = make_layout(shape(A));  // e.g. 1000:1
+Layout idAB = logical_divide(idA, B); // e.g. (128,8):(1,128)
+
+Tensor pred = make_tensor<bool>(shape(idAB));
+for (int i = 0; i < size(pred); ++i) {
+  pred(i) = idAB(i) < size(A);
+}
+
+// ... intervening code ...
+
+// Use the predicate tensor. c is some coordinate.
+//   This code would likely live inside some algorithm.
+if (pred(c)) { copy(A(idAB(c)), smem(c)); }
+```
+
+The general procedure is that we
+
+1. create an "identity" layout (`Layout idA = make_layout(shape(A))`,
+   in the above example) with the same shape as our original data;
+
+2. repeat the same tiling/partitioning/slicing (possibly rounding up)
+   on that identity layout (`Layout idAB = logical_divide(idA, B)`);
+
+3. create a "predicate tensor" by comparing the coordinates
+   of that reference layout with the bounds of the original layout;
+   and then
+
+4. use the predicate tensor to mask off accesses to out-of-bounds elements.
+
+For example, suppose that we've partitioned A and B tiles
+across threads as follows.
+
+```c++
+Tensor tAgA = local_partition(gA, tA, thread_idx); // (THR_M,THR_K,k)
+Tensor tAsA = local_partition(sA, tA, thread_idx); // (THR_M,THR_K,PIPE)
+
+Tensor tBgB = local_partition(gB, tB, thread_idx); // (THR_N,THR_K,k)
+Tensor tBsB = local_partition(sB, tB, thread_idx); // (THR_N,THR_K,PIPE)
+```
+
+`tAgA` and `tBgB` partition the global A resp. B matrices over threads,
+and `tAsA` and `tBsB` partition the shared memory tiles of A resp. B over threads.
+
+The following code creates predicate tensors
+corresponding to `tAgA` and `tBgB`.
+They will be computed once in the prologue
+and used to mask off instructions in the inner loop.
+
+```c++
+Tensor tApA = make_tensor<bool>(make_shape (size<0>(tAgA), size<1>(tAgA)),
+                                make_stride(     Int<1>{},      Int<0>{}));
+Tensor tBpB = make_tensor<bool>(make_shape (size<0>(tBgB), size<1>(tBgB)),
+                                make_stride(     Int<1>{},      Int<0>{}));
+```
+
+We're only thread-parallelizing over the leftmost (row) dimension,
+so we only need to predicate over the leftmost dimension.
+Thus, we can make the rightmost (column) stride zero,
+since we will never actually address the rightmost dimension.
+
+The following code creates "two-dimensional identity tensors"
+that map coordinates (m,k) -> (m,k)
+for the tile of data within the thread block.
+
+```c++
+Tensor cA = make_identity_tensor(make_shape(size<0>(sA), size<1>(sA))); // (BLK_M,BLK_K) -> (blk_m,blk_k)
+Tensor cB = make_identity_tensor(make_shape(size<0>(sB), size<1>(sB))); // (BLK_N,BLK_K) -> (blk_n,blk_k)
+```
+
+The following lines then tile and partition
+the two reference tensors
+in exactly the same way the data were tiled and partitioned
+into `tAsA` and `tBsB`.
+
+```c++
+Tensor tAcA = local_partition(cA, tA, thread_idx);
+Tensor tBcB = local_partition(cB, tB, thread_idx);
+```
+
+Tiling and partitioning affect the offset and domain,
+but not the codomain of the tensors,
+so we're left with tensors that map `(thr_m,thr_k) -> (m,k)`
+where `(thr_m,thr_k)` is this particular thread's subtensor of the tile
+and `(m,k)` is the original codomain: a coordinate into the original tile.
+
+The unrolled loops in the code below then compare
+the m- and n-coordinates of those tensors with our known maximums
+to mask off elements we are not allowed to access.
+ +```c++ +Tensor cA = make_identity_tensor(make_shape(size<0>(sA), size<1>(sA))); // (BLK_M,BLK_K) -> (blk_m,blk_k) +Tensor tAcA = local_partition(cA, tA, thread_idx); + +Tensor cB = make_identity_tensor(make_shape(size<0>(sB), size<1>(sB))); // (BLK_N,BLK_K) -> (blk_n,blk_k) +Tensor tBcB = local_partition(cB, tB, thread_idx); + +// Populate +CUTE_UNROLL +for (int m = 0; m < size<0>(tApA); ++m) { + tApA(m,0) = get<0>(tAcA(m,0)) < m_max_coord; +} +CUTE_UNROLL +for (int n = 0; n < size<0>(tBpB); ++n) { + tBpB(n,0) = get<0>(tBcB(n,0)) < n_max_coord; +} +``` + +Those last `for` loops fill in the two predicate tensors. +In this case, we only need to predicate over the leftmost dimension, +so we only address `(m,0)` resp. `(n,0)`. + +We can then use the predicate tensors in `copy_if` +to copy only the elements for which the corresponding +predicate tensor elements are nonzero. + +```c++ +// Prefetch k_tile=0, gate these on k_residue as well +CUTE_UNROLL +for (int k = 0; k < size<1>(tAsA); ++k) { + if (get<1>(tAcA(0,k)) >= -k_residue) { // some other condition on the column index + copy_if(tApA, tAgA(_,k,0), tAsA(_,k,0)); + } +} + +CUTE_UNROLL +for (int k = 0; k < size<1>(tBsB); ++k) { + if (get<1>(tBcB(0,k)) >= -k_residue) { // some other condition on the column index + copy_if(tBpB, tBgB(_,k,0), tBsB(_,k,0)); + } +} +``` + +Here are some advantages of this "reference tensor" approach. + +1. It doesn't depend on the layout/strides of the tensor + being predicated, just the logical bounds being imposed. + +2. The partitioning stage can be anything. + +3. It naturally extends to any-dimensional predication. + +4. It's a natural generalization of a typical CUDA 1-D + parallel vector access pattern, + which computes an access index `k` + (e.g., as `blockDim.x * blockIdx.x + threadIdx.x`) + and then predicates access to the vector's `k`-th element + on whether `k` is in bounds. + +As an example of (3), the epilogue predication does exactly the same thing, + +```c++ +// Repeat with a tensor of coordinates for predication +Tensor cC = make_identity_tensor(make_shape(size<0>(gC), size<1>(gC))); +Tensor tCcC = thr_mma.partition_C(cC); + +const bool isBetaZero = (beta == 0); + +CUTE_UNROLL +for (int i = 0; i < size(tCrC); ++i) { + if (elem_less(tCcC(i), make_coord(m_max_coord,n_max_coord))) { + tCgC(i) = isBetaZero ? alpha * tCrC(i) : alpha * tCrC(i) + beta * tCgC(i); + } +} +``` + +but with the mma responsible for the tiling/partitioning `tCcC` +so that the reference subtensor matches the accumulator's subtensor. +Then, the reference subtensor is predicated against the `if` bounds +(in both m- and n-coordinates) inside the `for` loop. + +Another way to explain this is that we don't modify the tiles +to give you the "right" extents so that you never overrun. +Instead, we let you query the original coordinate +to see if that coordinate overruns. +This avoids all branching and variable/dynamic loop bounds +(thus maintaining load balance and synchronicity, +both very important in-kernel) in favor of predication. +It's also general enough to extend to all ranks, +all layouts of threads and data, +and all tiling/partitioning patterns. 
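+
+As an illustration of advantage (3), here is a condensed, hypothetical sketch
+(reusing names from the snippets above; `m_max_coord` and `k_max_coord` stand
+for the bounds of one (BLK_M,BLK_K) tile of A) that predicates both modes at once.
+
+```c++
+Tensor cA   = make_identity_tensor(make_shape(size<0>(sA), size<1>(sA))); // (m,k) -> (m,k)
+Tensor tAcA = local_partition(cA, tA, thread_idx); // partitioned like the data
+Tensor tApA = make_tensor<bool>(shape(tAcA));      // one predicate per element
+CUTE_UNROLL
+for (int i = 0; i < size(tApA); ++i) {
+  tApA(i) = elem_less(tAcA(i), make_coord(m_max_coord, k_max_coord)); // in bounds?
+}
+copy_if(tApA, tAgA(_,_,0), tAsA(_,_,0));           // masked copy of k_tile 0
+```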
diff --git a/media/docs/cutlass_3x_backwards_compatibility.md b/media/docs/cutlass_3x_backwards_compatibility.md
new file mode 100644
index 0000000000..7be2a91bf8
--- /dev/null
+++ b/media/docs/cutlass_3x_backwards_compatibility.md
@@ -0,0 +1,473 @@
+[README](/README.md#documentation) > **CUTLASS 3.0 GEMM Backwards Compatibility**
+
+# CUTLASS 3.0 GEMM Backwards Compatibility
+
+Although CUTLASS 3.0 restructures the GEMM hierarchy and introduces new types for the
+threadblock layer and below, we intend the entire source code to be usable in user applications.
+We expect users to be able to `#include` any source file from CUTLASS 3.0, whether
+they implement the 2.x or the 3.x API, without breaking user builds. This means that a single
+translation unit should be able to contain any valid kernel regardless of its API version. The
+sections below discuss how `device` and `kernel` layer type names are made compatible across the
+two API versions, and what users can expect of the `threadblock` layer API going forward.
+
+## Compatible Device API
+
+The entry point for CUTLASS's Device GEMM API
+is the class
+`cutlass::gemm::device::GemmUniversalAdapter`.
+This class lives in the header file
+[include/cutlass/gemm/device/gemm_universal_adapter.h](/include/cutlass/gemm/device/gemm_universal_adapter.h).
+
+`GemmUniversalAdapter` is a "universal adapter"
+and serves as a common device interface
+for both CUTLASS 3.x and CUTLASS 2.x kernels.
+Its template parameter `GemmKernel`,
+the GEMM kernel type, can be any of the following:
+
+* `cutlass::gemm::kernel::GemmUniversal`,
+  implementing CUTLASS 3.x API kernels;
+* `cutlass::gemm::kernel::GemmUniversal`,
+  implementing CUTLASS 2.x API kernels;
+* any valid CUTLASS 2.x `kernel` layer GEMM that
+  was previously composable with `device::GemmUniversalAdapter`.
+
+Users implementing new kernels in either API should prefer
+using `kernel::GemmUniversal` as the kernel type
+and compose it with `device::GemmUniversalAdapter`.
+Users with existing `kernel::Gemm` kernels
+can continue to use them as template arguments
+of `device::GemmUniversalAdapter`. They can adopt
+`GemmUniversal` as a gradual migration path,
+since `GemmUniversal` accepts either 3.0 or 2.x collectives.
+Please see the [next section on `kernel::GemmUniversal`](#compatible-kernel-api) for details.
+
+`GemmUniversalAdapter` presents a single
+host-side interface to both 3.0 and 2.x kernels.
+CUTLASS accomplishes this by
+specializing `GemmUniversalAdapter`'s implementation
+on whether the kernel layer GEMM implements the 2.x or the 3.x API
+(as detected by `gemm::detail::IsCutlass3GemmKernel`,
+discussed below). As a result, `GemmUniversalAdapter`'s behavior
+might differ between the two specializations.
+
+### Device API design differences
+
+In CUTLASS 2.x, the Device API was more closely tied
+to the Kernel API. In CUTLASS 3.0, the Device API
+accepts any kernel type that meets the Kernel API
+interface requirements. CUTLASS 3.0's Device API code is
+parameterized by the kernel type, but this code
+is *generic*; the same code works for any kernel type.
+
+The device layer compatibility interface, `device::GemmUniversalAdapter`,
+also provides reflective mappings from 3.0-specific types
+back to the closest possible 2.x equivalent types. This is [discussed further in the section below](#conversions-between-2x-tags-and-30-types).
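+
+Concretely, composing either kind of kernel with the adapter looks the same
+from the user's point of view. The following is a hypothetical sketch, where
+`MyGemmKernel` stands in for any fully specified 2.x or 3.x kernel type.
+
+```c++
+using Gemm = cutlass::gemm::device::GemmUniversalAdapter<MyGemmKernel>;
+
+Gemm gemm_op;
+typename Gemm::Arguments args = /* problem shape, pointers, strides, ... */;
+cutlass::Status status = gemm_op(args); // construct params and launch
+```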
+
+CUTLASS 3.0's `device::GemmUniversalAdapter` also exposes some new APIs that the 2.x `device::GemmUniversalAdapter` implementation does not. Most notably, this includes the ability to bypass the lowering of `GemmKernel::Arguments` to `GemmKernel::Params`.
+
+```c++
+// Primary run() entry point API that is static, allowing users to create and manage their own params.
+static Status
+run(Params& params, cudaStream_t stream = nullptr);
+```
+
+This new API is useful in the following scenarios.
+
+* Running again does not require reinvoking `GemmKernel::to_underlying_arguments()`.
+* Manual control over construction of `GemmKernel::Params` for custom kernels with custom stride types.
+* Fully static problem shapes and strides for bespoke kernels where no argument mapping needs to take place.
+
+## Compatible Kernel API
+
+The CUTLASS 3.x API shares the kernel layer API with CUTLASS 2.x
+through the single entry point type `cutlass::gemm::kernel::GemmUniversal`.
+All kernel layer GEMMs are viewed as a composition of a collective mainloop
+and a collective epilogue.
+
+**`kernel::GemmUniversal` implements both 2.x and 3.x APIs**
+
+The entry point for CUTLASS's kernel API is the class
+`cutlass::gemm::kernel::GemmUniversal`.
+This class's declaration lives in the header file
+[include/cutlass/gemm/kernel/gemm_universal.hpp](/include/cutlass/gemm/kernel/gemm_universal.hpp).
+
+```c++
+/*
+ * Stateless universal device GEMM kernel type that treats GEMM as
+ * a composition of a collective mainloop and a collective epilogue.
+ * SFINAE shims both 2.x and 3.0 API kernels based on ProblemShapeOrThreadblockMma_.
+**/
+template <
+  class ProblemShapeOrThreadblockMma_,
+  class CollectiveMainloopOrEpilogue_,
+  class CollectiveEpilogueOrThreadblockSwizzle_,
+  class GridSwizzle_ = void,
+  class Enable = void
+>
+class GemmUniversal;
+```
+
+We call this class "universal" because it can be built
+using either the CUTLASS 3.0 or the 2.x mainloops and epilogues.
+If `GemmUniversal`'s first template argument
+(`ProblemShapeOrThreadblockMma_`) is a `cute::tuple`,
+then `GemmUniversal` assumes that
+the remaining three template arguments
+(the mainloop, epilogue, and grid swizzle)
+implement the 3.0 APIs.
+Otherwise, `GemmUniversal` assumes that
+the remaining three template arguments
+implement the 2.x APIs.
+All the template arguments must be either
+CUTLASS 3.0 or CUTLASS 2.x types. For example,
+`GemmUniversal` does not permit using
+a 2.x mainloop with a 3.0 collective epilogue.
+
+CUTLASS 3.x implements various embodiments of `kernel::GemmUniversal`.
+Each kernel layer schedule is specialized
+for a GEMM scheduling algorithm and GPU architecture.
+Specializations of `kernel::GemmUniversal` for 3.0 APIs live in
+any of various `gemm_*.hpp` files in the directory
+[include/cutlass/gemm/kernel/](../../include/cutlass/gemm/kernel/).
+The specialization to which to dispatch is decided through the dispatch policy's `Schedule` type.
+
+Specializations for 2.x APIs live in the header file
+[include/cutlass/gemm/kernel/gemm_universal.h](../../include/cutlass/gemm/kernel/gemm_universal.h).
+
+### Kernel API design differences
+
+The CUTLASS 2.x Kernel API was more closely tied
+to the Device API, as we mentioned above.
+In particular, the 2.x Device API specified the grid shape
+used to launch the Kernel API.
+In CUTLASS 3.0, the Kernel API controls its own grid shape,
+while the device adapter simply queries the kernel
+for the grid shape with which it needs to be launched.
+
+This change is required to support various kernel schedules
+that may need their own schedule-specific grid planning logic.
+For example, persistent kernel schedules generally only launch with
+as many threadblocks as the number of multiprocessors on the GPU.
+
+All CUTLASS 3 `kernel::GemmUniversal` specializations expose the following (static) API:
+
+```c++
+// Returns true if the kernel can execute the provided GEMM arguments.
+static bool
+can_implement(Arguments const& args);
+
+// Returns a dim3 representing the threadblock shape.
+static constexpr dim3
+get_block_shape();
+
+// Returns a dim3 representing the grid shape in terms of threadblocks.
+static constexpr dim3
+get_grid_shape(Params const& params);
+```
+
+The device adapter simply queries the kernel for these three quantities before launching it on the device.
+CUTLASS 3.0 provides a meta-function to detect whether a `cutlass::gemm::kernel::*` implements
+the 3.x API or 2.x API:
+
+```c++
+// include/cutlass/gemm/gemm.h
+
+namespace cutlass::gemm::detail {
+
+// The following metafunction is used to detect whether a
+// `kernel::Gemm` or `kernel::GemmUniversal` implements the CUTLASS 3.x API,
+// by checking whether the problem shape type is aliased within.
+template <class GemmKernel, class = void>
+struct IsCutlass3GemmKernel;
+
+} // namespace cutlass::gemm::detail
+```
+
+Users can use this as a type trait to dispatch their generic code
+between 2.x and 3.x kernel specializations, for example as sketched below.
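+
+A minimal sketch (ours; the `configure` function is hypothetical) of such a dispatch:
+
+```c++
+// Hypothetical sketch: branch generic host code on the kernel's API version.
+template <class GemmKernel>
+void configure() {
+  if constexpr (cutlass::gemm::detail::IsCutlass3GemmKernel<GemmKernel>::value) {
+    // 3.x path: e.g., problem shapes are cute tuples
+  } else {
+    // 2.x path: e.g., problem sizes are cutlass::gemm::GemmCoord
+  }
+}
+```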
+
+## Threadblock API and Inner Loops
+
+Much of the CUTLASS 3 GEMM hierarchy for mainloops and inner loops diverges
+from that of CUTLASS 2.x. With that also comes the introduction of the
+`cutlass::gemm::collective` layer as a direct replacement and a superset
+of the 2.x `cutlass::gemm::threadblock` layer. Going forward,
+CUTLASS 3.x will discontinue new developments in the following namespaces.
+
+* `cutlass::*::threadblock::*`
+* `cutlass::*::warp::*`
+* `cutlass::gemm::thread::*`
+* `cutlass::arch::*` (except `barrier.h`)
+
+`cutlass::gemm::collective`s are a superset of the threadblock layer where
+all new mainloops will be developed. Users should look to the `CollectiveMma` type
+if they wish to author custom mainloop code in the 3.x API.
+
+Similarly, for the GEMM inner loops, `cute::MMA_Atom`s replace the
+`gemm::warp` and `gemm::thread` layer code. Going forward, all new PTX instructions
+and associated metadata development will occur directly inside [`cute/arch/*.hpp`](/include/cute/arch/) and [`cute/atom/*.hpp`](/include/cute/atom/).
+
+The desired inner loop MMA iteration order and tiling can be achieved through careful
+selection of the atom layout, value layout, and permutations of the `cute::TiledMma`.
+
+For epilogues, the `cutlass::epilogue::collective` layer replaces `cutlass::threadblock::collective`. However, the thread-level epilogue elementwise operations
+in `cutlass::epilogue::thread` will continue to be used in 3.x kernels as well, albeit with
+a more idiomatic epilogue vectorization strategy.
+[Example 50](/examples/50_hopper_gemm_with_epilogue_swizzle/50_hopper_gemm_with_epilogue_swizzle.cu)
+shows how to use 2.x epilogue thread operators with 3.0 API kernels.
+
+## Porting from 2.x to 3.0 API
+
+### CUTLASS 2.x layout tags and CUTLASS 3.0 major modes
+
+CUTLASS 2.x and CUTLASS 3.0 use both
+different wording and different types
+to describe the permitted layouts
+of GEMM's input matrices A and B.
+
+CUTLASS 3.0 does not use the terms "column major"
+or "row major" to describe matrix layouts.
+Starting with CUTLASS 3.0, adoption of CuTe allows us to decouple
+
+* the coordinate mode order (logical shape) of layouts from
+
+* the index space stride order of the backing storage.
+
+In line with our switch to a conceptual GEMM hierarchy, we no longer view the major modes from a BLAS-3 perspective.
+Rather, we divide the modes into two categories.
+
+* "Inner modes" or "K-modes" are contracted over during the GEMM.
+  Therefore, they are not present in the output tensor.
+
+* "Outer modes" or "MN-modes" are preserved in the output.
+
+Now, instead of `RowMajor` or `ColumnMajor`, whose major stride depends on whether we are referring to the
+A or the B matrix, we uniformly employ the "K major" or "MN major" terminology and enforce the convention of all tensors having the shape `[M/N, K, L]` regardless of which mode is major. That is,
+
+* the input matrix A has shape M x K,
+* the input matrix B has shape N x K, and
+* the input/output matrices C/D have shape M x N.
+
+Note that this convention for B
+differs from the BLAS's GEMM interface,
+which specifies that B has shape K x N.
+
+CUTLASS 3.0 uses these names of the modes
+to specify which mode of a matrix has stride 1.
+For the matrix A,
+
+* "M major" means that the matrix is stride 1
+  in the M mode, and
+* "K major" means that the matrix is stride 1
+  in the K mode.
+
+For the matrix B,
+
+* "N major" means that the matrix is stride 1
+  in the N mode (which for B is mode 0,
+  because the convention is that B is N x K); and
+* "K major" means that the matrix is stride 1
+  in the K mode (which for B is mode 1).
+
+CUTLASS 2.x defines "layout tag" classes
+`cutlass::layout::ColumnMajor` and `cutlass::layout::RowMajor`,
+that live in the header file
+[`cutlass/layout/matrix.h`](/include/cutlass/layout/matrix.h).
+The interpretation of these layouts in GEMM
+depends on whether they are applied
+to the input matrix A or B. For the matrix A, "column major" means
+that the mode corresponding to the M extent has stride 1,
+and "row major" means that the mode corresponding to the K extent has stride 1.
+This is the usual computer science definition
+of column major and row major for a rank-2 array.
+For the matrix B (shape K x N in 2.x), the opposite holds
+with respect to the MN- and K-modes:
+"column major" means that the mode corresponding to the K extent has stride 1,
+and "row major" means that the mode corresponding to the N extent has stride 1.
+
+Using the convention of `[outer, inner, batch]` mode order for tensor logical shapes
+avoids potential confusion with the meaning of column major and row major
+changing depending on whether they are applied to A or B.
+
+The table below summarizes our mode order convention and
+mapping of 2.x layout tags to corresponding M-major, N-major, or K-major strides.
+
+| Matrix | CUTLASS 2.x layout | 2.x Shape | Logical major mode | 3.x Shape/Stride | Major ordinal |
+| --- | --- | --- | --- | --- | --- |
+| A | `ColumnMajor` | M x K | M major | M x K x L | 0 (outer) |
+| A | `RowMajor` | M x K | K major | M x K x L | 1 (inner) |
+| B | `RowMajor` | K x N | N major | N x K x L | 0 (outer) |
+| B | `ColumnMajor` | K x N | K major | N x K x L | 1 (inner) |
+| C | `ColumnMajor` | M x N | M major | M x N x L | 0 (outer) |
+| C | `RowMajor` | M x N | N major | M x N x L | 1 (inner) |
+
+Notice that in CUTLASS 3.0, the interpretation of layouts no longer changes based on
+whether we are talking about the A or B matrix. M and N major inputs always have a
+static size-1 stride in their 0th (outer) mode. Similarly, K major inputs
+always contain the static size-1 stride in their 1st mode.
+This uniformity in stride order
+allows us to represent tensor layouts much more cleanly and treat both A and B equally in our interfaces.
+See for example the following snippet from our [`kernel/sm70_gemm.hpp`](/include/cutlass/gemm/kernel/sm70_gemm.hpp)
+kernel layer schedule.
+
+```c++
+// Represent the full tensors
+Tensor mA_mkl = make_tensor(make_gmem_ptr(params.mainloop.ptr_A), make_shape(M,K,L), params.mainloop.dA); // (m,k,l)
+Tensor mB_nkl = make_tensor(make_gmem_ptr(params.mainloop.ptr_B), make_shape(N,K,L), params.mainloop.dB); // (n,k,l)
+
+// Get batch slice
+Tensor mA_mk = mA_mkl(_,_,get<3>(blk_coord_mnkl)); // (m,k)
+Tensor mB_nk = mB_nkl(_,_,get<3>(blk_coord_mnkl)); // (n,k)
+
+// Slice to get the tiles for which this thread block is responsible
+Tensor gA = local_tile(mA_mk, blk_shape, take<0,3>(blk_coord_mnkl), Step<_1, X,_1>{}); // (BLK_M,BLK_K,k)
+Tensor gB = local_tile(mB_nk, blk_shape, take<0,3>(blk_coord_mnkl), Step< X,_1,_1>{}); // (BLK_N,BLK_K,k)
+```
+
+As seen in this snippet, all input tensors have the logical shape `[outer, inner, batch]`,
+and the strides can describe storage that is major in the outer mode, major in the inner mode,
+or laid out with any other complex hierarchical stride order.
+CuTe layouts always maintain the logical consistency of the coordinate spaces regardless of the strides.
+
+By convention, in CUTLASS 3.0, we treat the M and N modes as the 0th mode,
+and the K mode as the 1st mode of the stride.
+
+### Conversions between 2.x tags and 3.0 types
+
+Starting with CUTLASS 3.0, all layouts are described using
+`cute::Shape` and `cute::Stride`, which compose into a `cute::Layout`.
+In CUTLASS 2.x, various layout tags such as `cutlass::layout::RowMajor` are used to specialize
+template implementations. These tag types only encode information about the tensor strides,
+as 2.x layouts did not incorporate any concept of tensor shape in the layout tags themselves.
+Users may find a need to convert between CUTLASS 2.x layout tags and 3.0
+CuTe stride types. CUTLASS 3.0 `gemm::collective::CollectiveBuilder` interfaces
+also accept these 2.x layout tags as input parameters in their template API as a convenience for users.
+At every entry point into CUTLASS 3.0, these tags get converted to the `cute::Stride`
+that best approximates them, via the following metafunctions.
+
+* `cutlass::gemm::detail::TagToStrideA_t`
+* `cutlass::gemm::detail::TagToStrideB_t`
+* `cutlass::gemm::detail::TagToStrideC_t`
+
+By convention, and to match user expectations, the `cute::Stride` types that these
+map onto always contain one static mode corresponding to the layout tag, and two 64-bit
+dynamic stride modes corresponding to the minor mode and the batch mode. Batch
+mode is included by default as all CUTLASS 3.0 kernels support packed batch-mode GEMMs
+out of the box.
+
+The [`cutlass/gemm/gemm.h#440`](../../include/cutlass/gemm/gemm.h#440)
+header file includes functions
+that can be useful for converting
+from CUTLASS 3.0 `cute::Stride`s back to CUTLASS 2.x layout tags.
+
+* `cutlass::gemm::detail::StrideToLayoutTagA_t`
+* `cutlass::gemm::detail::StrideToLayoutTagB_t`
+* `cutlass::gemm::detail::StrideToLayoutTagC_t`
+
+These metafunctions take the CuTe Stride as a template parameter and
+attempt to find the size-1 stride in the idiomatic M, N, or K modes
+to best approximate a corresponding 2.x layout tag type.
+Note that this may not work in general for any `cute::Stride`
+as the mapping between the stride and tag type is not bijective.
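+
+As a sketch (assuming the usual CUTLASS includes), the round trip for A's
+column-major tag might look like the following; the stride type shown in the
+comment is our approximation of what the metafunction produces.
+
+```c++
+using StrideA = cutlass::gemm::detail::TagToStrideA_t<cutlass::layout::ColumnMajor>;
+// StrideA is roughly cute::Stride<cute::Int<1>, int64_t, int64_t>:
+// a static stride-1 M mode, a dynamic K stride, and a dynamic batch stride.
+
+using TagA = cutlass::gemm::detail::StrideToLayoutTagA_t<StrideA>;
+static_assert(std::is_same_v<TagA, cutlass::layout::ColumnMajor>);
+```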
+
+These mapping utilities are kept in a `detail` namespace
+as we do not guarantee stability of their implementation.
+Their behavior may change in future releases as we add new features.
+However, we do expect these type names to remain stable. For users who want
+these 2.x reflective types from an assembled kernel with a more stable API,
+the specialization of `cutlass::gemm::device::GemmUniversalAdapter`
+for CUTLASS 3.0 kernels provides aliases for all the 2.x types
+in addition to the layout tags. You can see how they are used in the header file
+[`cutlass/gemm/device/gemm_universal_adapter.h`](/include/cutlass/gemm/device/gemm_universal_adapter.h).
+Here is an excerpt.
+
+```c++
+  // Map back to 2.x type as best as possible
+  using LayoutA = gemm::detail::StrideToLayoutTagA_t<typename GemmKernel::StrideA>;
+  using LayoutB = gemm::detail::StrideToLayoutTagB_t<typename GemmKernel::StrideB>;
+  using LayoutC = gemm::detail::StrideToLayoutTagC_t<typename GemmKernel::StrideC>;
+  using LayoutD = gemm::detail::StrideToLayoutTagC_t<typename GemmKernel::StrideD>;
+
+  // Legacy: Assume MultiplyAdd only since we do not use this tag type in 3.0
+  using MathOperator = cutlass::arch::OpMultiplyAdd;
+
+  // If our TiledMMA's instruction thread layout size is larger than 1,
+  // we know it's a tensorop
+  using OperatorClass = std::conditional_t<
+      (cute::size(typename GemmKernel::TiledMma::AtomThrID{}) > 1),
+      cutlass::arch::OpClassTensorOp, cutlass::arch::OpClassSimt>;
+
+  // Assume TiledMma's ShapeMNK is the same as 2.x's ThreadblockShape
+  using ThreadblockShape = cutlass::gemm::GemmShape<
+      cute::size<0>(TileShape{}),
+      cute::size<1>(TileShape{}),
+      cute::size<2>(TileShape{})>;
+
+  using ClusterShape = cutlass::gemm::GemmShape<
+      cute::size<0>(typename GemmKernel::DispatchPolicy::ClusterShape{}),
+      cute::size<1>(typename GemmKernel::DispatchPolicy::ClusterShape{}),
+      cute::size<2>(typename GemmKernel::DispatchPolicy::ClusterShape{})>;
+
+  // We get the instruction shape directly from our TiledMma's atom shape
+  using InstructionShape = cutlass::gemm::GemmShape<
+      cute::size<0>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{}),
+      cute::size<1>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{}),
+      cute::size<2>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{})>;
+
+  static int constexpr kStages = CollectiveMainloop::DispatchPolicy::Stages;
+  static int const kThreadCount = GemmKernel::MaxThreadsPerBlock;
+
+  // Warp shape is not a primary API type in 3.x,
+  // but we can best approximate it by inspecting the TiledMma::TiledShape_MNK.
+  // For this, we make the assumption that we always have 4 warps along M,
+  // and the rest along N, with none along K. We also always round up
+  // the warp count to 4 if the tiled mma is smaller than 128 threads.
+
+  static constexpr int WarpsInMma = std::max(4, cute::size(typename GemmKernel::TiledMma{}) / 32);
+  static constexpr int WarpsInMmaM = 4;
+  static constexpr int WarpsInMmaN = cute::ceil_div(WarpsInMma, WarpsInMmaM);
+  using WarpCount = cutlass::gemm::GemmShape<WarpsInMmaM, WarpsInMmaN, 1>;
+  using WarpShape = cutlass::gemm::GemmShape<
+      cute::size<0>(typename CollectiveMainloop::TiledMma::TiledShape_MNK{}) / WarpsInMmaM,
+      cute::size<1>(typename CollectiveMainloop::TiledMma::TiledShape_MNK{}) / WarpsInMmaN,
+      cute::size<2>(typename CollectiveMainloop::TiledMma::TiledShape_MNK{})>;
+
+  // Inspect TiledCopy for A and B to compute the alignment size
+  static int constexpr kAlignmentA = gemm::detail::get_alignment_count_from_gmem_tiled_copy<
+      typename CollectiveMainloop::GmemTiledCopyA, ElementA>();
+  static int constexpr kAlignmentB = gemm::detail::get_alignment_count_from_gmem_tiled_copy<
+      typename CollectiveMainloop::GmemTiledCopyB, ElementB>();
+```
+
+CUTLASS's library and profiler use these reflective interfaces to
+obtain the kernel's configuration parameters. Users can use these to approximate the CUTLASS 2.x types
+for 3.0 API kernels. However, the reflective interfaces cannot always match the types exactly,
+as the mappings are not always bijective.
+
+# Copyright
+
+Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+  1. Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+  2. Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+  3. Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
diff --git a/media/docs/cutlass_3x_design.md b/media/docs/cutlass_3x_design.md
new file mode 100644
index 0000000000..9db3359d26
--- /dev/null
+++ b/media/docs/cutlass_3x_design.md
@@ -0,0 +1,117 @@
+[README](/README.md#documentation) > **CUTLASS 3.0 Design and Hierarchy**
+
+# CUTLASS 3.0 Design
+
+CUTLASS 3.0 is a major enhancement over the abstractions of CUTLASS 2.x
+and aims to make usage of all layers of the GEMM hierarchy easier and more composable
+while still achieving peak performance on hardware.
+
+## CUTLASS 3.0 design goals
+
+CUTLASS 3.0 has the following design goals, in no particular order.
+
+- Simplify expressing and manipulating data and thread layouts across
+  the GEMM hierarchy with CuTe layouts and layout algebra.
+
+- Improve code readability and learning curve by
+  reducing the number of named types.
+
+- Functional correctness by default,
+  actionable static asserts otherwise.
+
+- Single, clear points of performance tuning and custom kernel extensions.
+
+- Support for NVIDIA Hopper GPUs with great performance using
+  features such as Tensor Cores, the tensor memory accelerator, and thread block clusters.
+
+## A new Conceptual GEMM Hierarchy
+
+CUTLASS 2.x decomposes the moving parts of a GEMM operation
+across a hierarchy that closely mirrors the organization of GPU
+architectures. This is discussed in detail within the
+[CUTLASS 2.x GEMM API documentation](/media/docs/gemm_api.md).
+This design, however, sometimes results in a coupling that is too tight
+to extend to newer GPU features that might not fit into the same architectural
+hierarchy. For instance, Hopper's warp-group wide instructions do not naturally
+fit into any warp or thread layer GEMM concept in CUTLASS 2.x. Even for Volta tensor cores,
+instructions that atomically exist at the quad-pair granularity are first tiled at
+the warp level before use. This hints at the brittleness of the abstraction.
+
+CUTLASS 3.0 detaches its interface layers from the hardware,
+centering them instead around the natural structure of GEMM algorithms
+not tied to any particular GPU generation.
+This makes CUTLASS's code more robust to GPU architecture evolution,
+less prone to implementation detail leakage, and provides users
+with a consistent interface to hardware acceleration regardless of
+the architecture specific details.
+
+The new conceptual GEMM hierarchy is discussed in detail in the dedicated
+[CUTLASS 3.0 GEMM API documentation readme](/media/docs/gemm_api_3x.md),
+along with code examples of the core concepts and types.
+
+## Adoption of CuTe Layout and Tensors
+
+CUTLASS 3.0 introduces a new core library, CuTe, to describe and manipulate tensors of threads and data.
+CuTe is a collection of C++ CUDA template abstractions for defining and operating on hierarchically multidimensional layouts of threads and data. CuTe provides `Layout` and `Tensor` objects that compactly package the type, shape, memory space, and layout of data, while performing the complicated indexing for the user.
+
+CUTLASS 3.0 adopts CuTe throughout the GEMM hierarchy in its templates, greatly simplifying the design
+and improving code composability and readability. More documentation specific to CuTe can be found in its [dedicated documentation directory](/media/docs/cute/00_quickstart.md).
+
+![CuTe helps reduce named iterator types down to a single vocabulary type, `Layout`](/media/images/cutlass-reduction-in-named-iterators.png)
+
+Programming massively parallel systems with various layers of logical thread and data hierarchies is not a trivial task.
+
+- `cute::Layout`s always maintain logical consistency of their coordinates,
+  allowing us to check pre- and post-conditions at compile time for all static inner loops.
+- Explicit thread to data mapping allows users and kernel authors to inspect and reason about operations
+  from a single point in the source code.
+- Layouts provide a single point of performance tuning, as most optimizations can be done by careful
+  selection of thread and data layouts.
+- Formalized algebra makes manipulation of and reasoning about thread->data mapping explicit in source code.
+- A single vocabulary type (`cute::Layout`) subsumes every iterator and layout in CUTLASS 2.x.
+  CUTLASS 2.x uses many bespoke thread maps, iterators, and data layouts. Iterators are fundamentally 1-D,
+  whereas most layouts we encounter in the GPU hierarchy are fundamentally n-D.
+
+## Reducing the number of named types and iterator concepts
+
+The CUTLASS 2.x design preferred introducing bespoke named types for each
+architecture specific thread and data layout. For instance, the `gemm::threadblock` namespace
+contains implementations of `MmaMultistage`, `MmaPlanarComplexMultistage`, `MmaPipelined`, etc.,
+even though they all provide mainloops for GEMMs. To spell these types the same way in generic code,
+CUTLASS 2.x provides aliases through its `default_x_configuration.h` files; however,
+these aliases make the code much harder to read, as the user has to perform type substitution
+mentally in order to understand the codebase.
+
+CUTLASS 3.0 greatly reduces the number of named types used throughout by
+
+- Replacing all iterator concepts for all memory domains with `cute::Tensor`s
+- Dispatching mainloop and epilogue implementations on tag-dispatch policies rather than naming new types
+- Dispatching kernel layer schedules on tag-dispatch policies rather than naming new types
+
+Reducing the number of named types has many benefits:
+
+- It *makes writing generic code easier*, as the primary type names share the same spelling
+  without aliasing through configuration providers.
+- It *flattens the learning curve of CUTLASS* by greatly reducing the mental context required,
+  as the library only exposes a handful of named types.
+- It *provides a clear, singular extension point* for users to plug in their customizations
+  through the dispatch policies.
+
+## Correctness by default, Performance through clear, individual points of tuning
+
+CUTLASS 2.x maintained its thread layouts as implicit indexing math implemented
+as a part of 1D iterators. This meant that the thread to data layout mapping
+was implicit in the imperative structure of the C++ code itself and did not have
+a formal algebra we could use to manipulate these mappings. Each iterator
+had to re-implement its indexing and mapping logic. This made it hard to learn
+how this mapping was performed for existing iterators, and even harder to
+implement custom layout functions for the core inner loops of a GEMM.
+
+CUTLASS 3.0 replaces all iterator concepts from CUTLASS 2.x
+with a single layout type for thread and data tensors.
+CuTe's formalized layout algebra is then used at every layer of
+the GEMM hierarchy to manipulate the mapping between the two.
+CuTe layouts always maintain logical consistency, and for fully static layouts
+(such as in the core unrolled inner loops), provide
+compile time checks that break builds if this consistency is violated.
+In this way, CuTe reifies the thread-to-data-layout mapping
+and makes it easier to write code that is "correct by construction."
+If the code compiles, it's probably correct.
diff --git a/media/docs/doxygen_mainpage.md b/media/docs/doxygen_mainpage.md
index 1cb5a56b07..4145748164 100644
--- a/media/docs/doxygen_mainpage.md
+++ b/media/docs/doxygen_mainpage.md
@@ -1,14 +1,14 @@
-# CUTLASS 2.0
+# CUTLASS 3.0
-_CUTLASS 2.0 - November 2019_
+_CUTLASS 3.0 - January 2023_
 
 CUTLASS is a collection of CUDA C++ template abstractions for implementing
 high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA.
 It incorporates strategies for hierarchical decomposition and data movement similar
 to those used to implement cuBLAS. CUTLASS decomposes these "moving parts" into
 reusable, modular software components abstracted by C++ template classes. These
-thread-wide, warp-wide, block-wide, and device-wide primitives can be specialized
-and tuned via custom tiling sizes, data types, and other algorithmic policy. The
+components can be specialized
+and tuned via custom tiling sizes, data types, and other algorithmic policies. The
 resulting flexibility simplifies their use as building blocks within custom kernels
 and applications.
@@ -16,107 +16,25 @@ To support a wide variety of applications, CUTLASS provides extensive support for
 mixed-precision computations, providing specialized data-movement and
 multiply-accumulate abstractions for 8-bit integer, half-precision floating
 point (FP16), single-precision floating point (FP32), and double-precision floating
-point (FP64) types. Furthermore, CUTLASS demonstrates warp-synchronous matrix multiply
-operations for targeting the programmable, high-throughput _Tensor Cores_ implemented
-by NVIDIA's Volta and Turing architectures.
+point (FP64) types. Furthermore, CUTLASS exploits the _Tensor Cores_ and asynchronous
+memory copy operations of the latest NVIDIA GPU architectures.
+# What's New in CUTLASS 3.0
-# What's New in CUTLASS 2.0
+For an overview of CUTLASS 3.0's GEMM interface levels,
+please refer to the
+[CUTLASS 3.0 GEMM API document](./gemm_api_3x.md).
+To learn how to migrate code using CUTLASS 2.x's interface
+to CUTLASS 3.0, please refer to the
+[backwards compatibility document](./cutlass_3x_backwards_compatibility.md).
-CUTLASS 2.0 is a substantial refactoring from the previous version, intended to offer:
+# GEMM examples
-- Better performance over 1.x, particularly for kernels targeting Turing Tensor Cores
-- Robust and durable templates that reliably span the design space
-- Encapsulated functionality that may be reusable in other contexts
-
-
-# Example CUTLASS GEMM
-
-The following illustrates an example function that defines a CUTLASS GEMM kernel
-with single-precision inputs and outputs. This is an excerpt from the CUTLASS SDK
-[basic_gemm example](https://github.com/NVIDIA/cutlass/tree/master/examples/00_basic_gemm/basic_gemm.cu).
-
-~~~~~~~~~~~~~~~~~~~~~{.cpp}
-//
-// CUTLASS includes needed for single-precision GEMM kernel
-//
-
-// Defines cutlass::gemm::device::Gemm, the generic Gemm computation template class.
-
-#include <cutlass/gemm/device/gemm.h>
-
-/// Define a CUTLASS GEMM template and launch a GEMM kernel.
-cudaError_t cutlass_sgemm_nn(
-  int M,
-  int N,
-  int K,
-  float alpha,
-  float const *A,
-  int lda,
-  float const *B,
-  int ldb,
-  float beta,
-  float *C,
-  int ldc) {
-
-  // Define type definition for single-precision CUTLASS GEMM with column-major
-  // input matrices and 128x128x8 threadblock tile size (chosen by default).
-  //
-  // To keep the interface manageable, several helpers are defined for plausible compositions
-  // including the following example for single-precision GEMM. Typical values are used as
-  // default template arguments. See `cutlass/gemm/device/default_gemm_configuration.h` for more details.
-  //
-  // To view the full gemm device API interface, see `cutlass/gemm/device/gemm.h`
-
-  using ColumnMajor = cutlass::layout::ColumnMajor;
-
-  using CutlassGemm = cutlass::gemm::device::Gemm<float,        // Data-type of A matrix
-                                                  ColumnMajor,  // Layout of A matrix
-                                                  float,        // Data-type of B matrix
-                                                  ColumnMajor,  // Layout of B matrix
-                                                  float,        // Data-type of C matrix
-                                                  ColumnMajor>; // Layout of C matrix
-
-  // Define a CUTLASS GEMM type
-
-  CutlassGemm gemm_operator;
-
-  // Construct the CUTLASS GEMM arguments object.
-  //
-  // One of CUTLASS's design patterns is to define gemm argument objects that are constructible
-  // in host code and passed to kernels by value. These may include pointers, strides, scalars,
-  // and other arguments needed by Gemm and its components.
-  //
-  // The benefits of this pattern are (1.) a structured, composable strategy for passing host-constructible
-  // arguments to kernels and (2.) minimized initialization overhead on kernel entry.
-  //
-
-  CutlassGemm::Arguments args({M , N, K},    // Gemm Problem dimensions
-                              {A, lda},      // Tensor-ref for source matrix A
-                              {B, ldb},      // Tensor-ref for source matrix B
-                              {C, ldc},      // Tensor-ref for source matrix C
-                              {C, ldc},      // Tensor-ref for destination matrix D (may be different memory than source C matrix)
-                              {alpha, beta}); // Scalars used in the Epilogue
-
-  //
-  // Launch the CUTLASS GEMM kernel.
-  //
-
-  cutlass::Status status = gemm_operator(args);
-
-  //
-  // Return a cudaError_t if the CUTLASS GEMM operator returned an error code.
-  //
-
-  if (status != cutlass::Status::kSuccess) {
-    return cudaErrorUnknown;
-  }
-
-  // Return success, if no errors were encountered.
-
-  return cudaSuccess;
-}
-~~~~~~~~~~~~~~~~~~~~~
+For a code example showing how to define
+a GEMM kernel using CUTLASS, please refer to
+[the quickstart guide](./quickstart.md).
+The [`examples` directory](../../examples)
+has a variety of examples.
 
 # Copyright
diff --git a/media/docs/efficient_gemm.md b/media/docs/efficient_gemm.md
index 359d5794a3..533ebc85df 100644
--- a/media/docs/efficient_gemm.md
+++ b/media/docs/efficient_gemm.md
@@ -219,6 +219,21 @@ which has to happen at the end among the participating warps. This is because
 each warp computes using only a "slice" of CtaTileK, so each warp only has a
 partial sum before the reduction.
 
+### Warp Specialization
+
+Starting with Hopper, CUTLASS 3.0 incorporates the concept of [Warp Specialization](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#spatial-partitioning-also-known-as-warp-specialization)
+as part of the kernel design. A thread block is partitioned into two sets of warps, the [*producer* warp group](/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp) and the [*consumer* warp group](/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp). The *producer* warp group loads data from global memory into shared memory buffers using the new [Tensor Memory Accelerator (TMA)](https://developer.nvidia.com/blog/nvidia-hopper-architecture-in-depth/).
+
+The [*producer* warp group (DMA)](/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp) waits for the shared memory buffers to be signaled as [empty](/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp) by the *consumer* warp group, using the newly added **Async Pipeline class** (see the [pipeline document](/media/docs/pipeline.md)). Once the data is written into shared memory, TMA also updates the barrier associated with that stage to notify affected threads that the buffer has been [filled](/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp). The [*consumer* warp group (MMA)](/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp), on the other hand, waits for the *producer* warp group to signal that the buffer is [filled](/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp) and then launches tensor core MMA operations.
Finally, the *consumer* warp group [releases](/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp) the buffers for the next set of TMA loads to happen.
+
+**Warp-Specialized Persistent kernel design**
+
+Another flavor of the Warp-Specialized kernel design introduced with Hopper is the [*Warp-Specialized Persistent*](/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_persistent.hpp) kernel. As in the Warp-Specialized kernel, the concepts of warp groups and barrier synchronization between warp groups remain the same in the persistent design. The distinctive features of the Warp-Specialized Persistent kernel are the following:
+* Persistent thread blocks are launched to occupy as many SMs as specified in the [KernelHardwareInfo](/include/cutlass/kernel_hardware_info.hpp) struct. These persistent thread blocks are used to tile the output and thus (potentially) compute multiple output tiles over their lifetime. The main benefit this adds is amortization of the thread-block launch and kernel prologue overheads, which are typical of all kernels.
+* The presence of two *consumer* warp groups, which allows the *epilogue* of one *consumer* warp group to be overlapped with the math operations of the other *consumer* warp group, thus maximizing tensor core utilization.
+
+Each *consumer* warp group is assigned a different output tile. The *producer* warp group synchronizes using the [Ordered Sequence Barrier](/include/cutlass/pipeline.hpp) to fill the buffers of the two *consumer* warp groups one after the other, in order. Since each thread block now computes multiple output tiles, the shape of the grid launch and the scheduling of tiles to the thread blocks are managed using the new [*Tile Scheduler*](/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp). The *Tile Scheduler* considers the shape of the *clusters* as well as the number of available SMs to compute a valid scheduling of output tiles to the launched thread blocks.
+
# Resources

The following additional resources describe design and implementation details of GEMMs
diff --git a/media/docs/functionality.md b/media/docs/functionality.md
index 71bc9b0925..fea258f4ab 100644
--- a/media/docs/functionality.md
+++ b/media/docs/functionality.md
@@ -4,12 +4,15 @@

# Functionality

+Note: CUTLASS 3.0 requires CUDA 11.4 or newer and SM70 or newer for the target toolkit and architecture, respectively.
+Please refer to the [Compatibility](/README.md#Compatibility) section for more details.
+
- N - Column Major Matrix
- T - Row Major matrix
-- {N,T} x {N,T} - All combinations, i.e. NN, NT, TN, TT
+- {N,T} x {N,T} - All combinations, i.e., NN, NT, TN, TT
- [NHWC](/include/cutlass/layout/tensor.h#L63-206) - 4 dimension tensor used for convolution
- [NCxHWx](/include/cutlass/layout/tensor.h#L290-395) - Interleaved 4 dimension tensor used for convolution
-- f - float point
+- f - floating point
- s - signed int
- b - bit
- cf - complex float
@@ -22,42 +25,55 @@

## Device-level GEMM

-The following table summarizes device-level GEMM kernels in CUTLASS, organized by opcode class, data type, and layout.
+The following tables summarize device-level GEMM kernels in CUTLASS, organized by opcode class, data type, and layout.
Hyperlinks to relevant unit tests demonstrate how specific template instances may be defined.
+### CUTLASS 3.x Kernels + +|**Opcode Class** | **Compute Capability** | **CUDA Toolkit** | **Data Type** | **Layouts** | **Unit Test** | +|-----------------|------------------------|------------------|--------------------------------|------------------------|------------------| +| **TensorOp** | 90a | 12.0+ | `f16 * f16 + { f16, f32 } => { f16, f32 }` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized.cu) | +| **TensorOp** | 90a | 12.0+ | `bf16 * bf16 + { f16, f32 } => { bf16, f32 }`| {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_tensor_op_f32.cu) | +| **TensorOp** | 90a | 12.0+ | `{f32, tf32} * {f32, tf32} + f32 => f32`| { T } x { N } => {N,T} | [example](/test/unit/gemm/device/sm90_gemm_f32_f32_f32_tensor_op_f32.cu) | +| **TensorOp** | 90a | 12.0+ | `s8 * s8 + s32 => {s32, s8}` | { T } x { N } => {N,T} | [example](/test/unit/gemm/device/sm90_gemm_s8_s8_s8_tensor_op_s32.cu) | + + +### CUTLASS 2.x Kernels + |**Opcode Class** | **Compute Capability** | **CUDA Toolkit** | **Data Type** | **Layouts** | **Unit Test** | |-----------------|------------------------|------------------|--------------------------------|------------------------|------------------| -| **Simt** | 50,60,61,70,75 | 9.2+ | `f32 * f32 + f32 => f32` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/simt_sgemm_nt_sm50.cu) | -| **Simt** | 50,60,61,70,75 | 9.2+ | `f64 * f64 + f64 => f64` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/simt_dgemm_nt_sm50.cu) | -| **Simt** | 60,61,70,75 | 9.2+ | `f16 * f16 + f16 => f16` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/simt_hgemm_nt_sm50.cu) | -| **Simt** | 61,70,75 | 9.2+ | `s8 * s8 + s32 => {s32,s8}` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/simt_igemm_nt_sm50.cu) | -| **WmmaTensorOp** | 70 | 9.2+ | `f16 * f16 + f16 => f16` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu) | -| **WmmaTensorOp** | 70 | 9.2+ | `f16 * f16 + f32 => {f16, f32}`| {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu) | -| **WmmaTensorOp** | 75 | 10.0+ | `s8 * s8 + s32 => {s32, s8}` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu) | -| **WmmaTensorOp** | 75 | 10.0+ | `s4 * s4 + s32 => {s32, s4}` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_s4t_s4n_s4t_wmma_tensor_op_s32_sm75.cu) | -| **WmmaTensorOp** | 75 | 10.0+ | `b1 ^ b1 + s32 => {s32, b1}` | { T } x { N } => {N,T} | [example](/test/unit/gemm/device/gemm_b1t_b1n_b1t_wmma_tensor_op_s32_sm75.cu) | -| **TensorOp** | 70 | 10.1+ | `f16 * f16 + f16 => f16` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu) | -| **TensorOp** | 70 | 10.1+ | `f16 * f16 + f32 => {f16, f32}`| {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f32_sm70.cu) | -| **TensorOp** | 75 | 10.2+ | `f16 * f16 + f16 => f16` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu) | -| **TensorOp** | 75 | 10.2+ | `f16 * f16 + f32 => {f16, f32}`| {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm75.cu) | -| **TensorOp** | 75 | 10.2+ | `s8 * s8 + s32 => {s32, s8}` | { T } x { N } => {N,T} | [example](/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu) | 
-| **TensorOp** | 75 | 10.2+ | `s4 * s4 + s32 => {s32, s4}` | { T } x { N } => {N,T} | [example](/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu) | -| **TensorOp** | 75 | 10.2+ | `b1 ^ b1 + s32 => {s32, b1}` | { T } x { N } => {N,T} | [example](/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu) | -| **TensorOp** | 80 | 11.0+ | `f16 * f16 + f16 => f16` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu) | -| **TensorOp** | 80 | 11.0+ | `f16 * f16 + f32 => {f16, f32}`| {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu) | -| **TensorOp** | 80 | 11.0+ | `bf16 * bf16 + f32 => {bf16, f32}`| {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_bf16n_bf16t_bf16t_tensor_op_f32_sm80.cu) | -| **TensorOp** | 80 | 11.0+ | `tf32 * tf32 + f32 => f32`| {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f32n_f32t_f32t_tensor_op_f32_sm80.cu) | -| **TensorOp** | 80 | 11.0+ | `s8 * s8 + s32 => {s32, s8}` | { T } x { N } => {N,T} | [example](/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm80.cu) | -| **TensorOp** | 80 | 11.0+ | `s4 * s4 + s32 => {s32, s4}` | { T } x { N } => {N,T} | [example](/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu) | -| **TensorOp** | 80 | 11.0+ | `b1 ^ b1 + s32 => {s32, b1}` | { T } x { N } => {N,T} | [example](/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu) | -| **TensorOp** | 80 | 11.0+ | `f64 * f64 + f64 => f64` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu) | -| **TensorOp** | 80 | 11.0+ | `cf32 * cf32 + cf32 => cf32` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu) | -| **TensorOp** | 80 | 11.0+ | `cf64 * cf64 + cf64 => cf64` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu), [Gaussian 3m](/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu) | -| **SpTensorOp** | 80 | 11.1+ | `f16 * f16 + f32 => {f16, f32}` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu) | -| **SpTensorOp** | 80 | 11.1+ | `bf16 * bf16 + f32 => {bf16, f32}` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu) | -| **SpTensorOp** | 80 | 11.1+ | `tf32 * tf32 + f32 => f32` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sparse_sm80.cu) | -| **SpTensorOp** | 80 | 11.1+ | `s8 * s8 + s32 => {s8, s32}` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu) | -| **SpTensorOp** | 80 | 11.1+ | `s4 * s4 + s32 => {s4, s32}` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu) | +| **Simt** | 50+ | 11.4+ | `f32 * f32 + f32 => f32` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/simt_sgemm_nt_sm50.cu) | +| **Simt** | 50+ | 11.4+ | `f64 * f64 + f64 => f64` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/simt_dgemm_nt_sm50.cu) | +| **Simt** | 60+ | 11.4+ | `f16 * f16 + f16 => f16` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/simt_hgemm_nt_sm50.cu) | +| **Simt** | 61+ | 11.4+ | `s8 * s8 + s32 => {s32,s8}` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/simt_igemm_nt_sm50.cu) | +| **WmmaTensorOp** | 70+ | 11.4+ | `f16 * f16 + f16 
=> f16` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu) | +| **WmmaTensorOp** | 70+ | 11.4+ | `f16 * f16 + f32 => {f16, f32}`| {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu) | +| **WmmaTensorOp** | 75+ | 11.4+ | `s8 * s8 + s32 => {s32, s8}` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu) | +| **WmmaTensorOp** | 75+ | 11.4+ | `s4 * s4 + s32 => {s32, s4}` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_s4t_s4n_s4t_wmma_tensor_op_s32_sm75.cu) | +| **WmmaTensorOp** | 75+ | 11.4+ | `b1 ^ b1 + s32 => {s32, b1}` | { T } x { N } => {N,T} | [example](/test/unit/gemm/device/gemm_b1t_b1n_b1t_wmma_tensor_op_s32_sm75.cu) | +| **TensorOp** | 70+ | 11.4+ | `f16 * f16 + f16 => f16` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu) | +| **TensorOp** | 70+ | 11.4+ | `f16 * f16 + f32 => {f16, f32}`| {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f32_sm70.cu) | +| **TensorOp** | 75+ | 11.4+ | `f16 * f16 + f16 => f16` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu) | +| **TensorOp** | 75+ | 11.4+ | `f16 * f16 + f32 => {f16, f32}`| {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm75.cu) | +| **TensorOp** | 75+ | 11.4+ | `s8 * s8 + s32 => {s32, s8}` | { T } x { N } => {N,T} | [example](/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu) | +| **TensorOp** | 75+ | 11.4+ | `s4 * s4 + s32 => {s32, s4}` | { T } x { N } => {N,T} | [example](/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu) | +| **TensorOp** | 75+ | 11.4+ | `b1 ^ b1 + s32 => {s32, b1}` | { T } x { N } => {N,T} | [example](/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu) | +| **TensorOp** | 80+ | 11.4+ | `f16 * f16 + f16 => f16` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu) | +| **TensorOp** | 80+ | 11.4+ | `f16 * f16 + f32 => {f16, f32}`| {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu) | +| **TensorOp** | 80+ | 11.4+ | `bf16 * bf16 + f32 => {bf16, f32}`| {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_bf16n_bf16t_bf16t_tensor_op_f32_sm80.cu) | +| **TensorOp** | 80+ | 11.4+ | `tf32 * tf32 + f32 => f32`| {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f32n_f32t_f32t_tensor_op_f32_sm80.cu) | +| **TensorOp** | 80+ | 11.4+ | `s8 * s8 + s32 => {s32, s8}` | { T } x { N } => {N,T} | [example](/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm80.cu) | +| **TensorOp** | 80+ | 11.4+ | `s4 * s4 + s32 => {s32, s4}` | { T } x { N } => {N,T} | [example](/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu) | +| **TensorOp** | 80+ | 11.4+ | `b1 ^ b1 + s32 => {s32, b1}` | { T } x { N } => {N,T} | [example](/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu) | +| **TensorOp** | 80+ | 11.4+ | `f64 * f64 + f64 => f64` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu) | +| **TensorOp** | 80+ | 11.4+ | `cf32 * cf32 + cf32 => cf32` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu) | +| **TensorOp** | 80+ | 11.4+ | `cf64 * cf64 + cf64 => cf64` | {N,T} x {N,T} => 
{N,T} | [example](/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu), [Gaussian 3m](/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu) | +| **SpTensorOp** | 80+ | 11.4+ | `f16 * f16 + f32 => {f16, f32}` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu) | +| **SpTensorOp** | 80+ | 11.4+ | `bf16 * bf16 + f32 => {bf16, f32}` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu) | +| **SpTensorOp** | 80+ | 11.4+ | `tf32 * tf32 + f32 => f32` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sparse_sm80.cu) | +| **SpTensorOp** | 80+ | 11.4+ | `s8 * s8 + s32 => {s8, s32}` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu) | +| **SpTensorOp** | 80+ | 11.4+ | `s4 * s4 + s32 => {s4, s32}` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu) | +| **TensorOp** | 90+ | 11.8+ | `f64 * f64 + f64 => f64` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm90.cu) | ## Device-level Implicit GEMM convolution @@ -68,19 +84,19 @@ One can find and/or create equivalent dgrad and wgrad convolutional operators. |**Opcode Class** | **Compute Capability** | **CUDA Toolkit** | **Data Type** | **Layouts** | **Unit Test** | |-----------------|------------------------|------------------|--------------------------------|------------------|------------------| -| **Simt** | 50,60,61,70,75 | 9.2+ | `f32 * f32 + f32 => f32` | NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu) | -| **Simt** | 50,60,61,70,75 | 9.2+ | `cf32 * cf32 + cf32 => cf32` | NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu) | -| **TensorOp** | 70 | 10.1+ | `f16 * f16 + f32 => {f16, f32}`| NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu) | -| **TensorOp** | 75 | 10.2+ | `f16 * f16 + f32 => {f16, f32}`| NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu) | -| **TensorOp** | 75 | 10.2+ | `s8 * s8 + s32 => {s32, s8}` | NHWC, NCxHWx | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu), [ncxhwx](/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu) | -| **TensorOp** | 75 | 10.2+ | `s4 * s4 + s32 => {s32, s4}` | NHWC, NCxHWx | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu), [ncxhwx](/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu) | -| **Simt** | 80 | 11.0+ | `f32 * f32 + f32 => f32` | NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu) | -| **Simt** | 80 | 11.0+ | `cf32 * cf32 + cf32 => cf32` | NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu) | -| **TensorOp** | 80 | 11.0+ | `f16 * f16 + f32 => {f16, f32}`| NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu) | -| **TensorOp** | 80 | 11.0+ | `f16 * f16 + f16 => f16` | NHWC | 
[example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu) | -| **TensorOp** | 80 | 11.0+ | `tf32 * tf32 + f32 => f32` | NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu) | -| **TensorOp** | 80 | 11.0+ | `s8 * s8 + s32 => {s32, s8}` | NHWC, NCxHWx | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu), [ncxhwx](/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu) | -| **TensorOp** | 80 | 11.0+ | `s4 * s4 + s32 => {s32, s4}` | NHWC, NCxHWx | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu), [ncxhwx](/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu) | +| **Simt** | 50+ | 11.4+ | `f32 * f32 + f32 => f32` | NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu) | +| **Simt** | 50+ | 11.4+ | `cf32 * cf32 + cf32 => cf32` | NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu) | +| **TensorOp** | 70+ | 11.4+ | `f16 * f16 + f32 => {f16, f32}`| NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu) | +| **TensorOp** | 75+ | 11.4+ | `f16 * f16 + f32 => {f16, f32}`| NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu) | +| **TensorOp** | 75+ | 11.4+ | `s8 * s8 + s32 => {s32, s8}` | NHWC, NCxHWx | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu), [ncxhwx](/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu) | +| **TensorOp** | 75+ | 11.4+ | `s4 * s4 + s32 => {s32, s4}` | NHWC, NCxHWx | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu), [ncxhwx](/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu) | +| **Simt** | 80+ | 11.4+ | `f32 * f32 + f32 => f32` | NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu) | +| **Simt** | 80+ | 11.4+ | `cf32 * cf32 + cf32 => cf32` | NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu) | +| **TensorOp** | 80+ | 11.4+ | `f16 * f16 + f32 => {f16, f32}`| NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu) | +| **TensorOp** | 80+ | 11.4+ | `f16 * f16 + f16 => f16` | NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu) | +| **TensorOp** | 80+ | 11.4+ | `tf32 * tf32 + f32 => f32` | NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu) | +| **TensorOp** | 80+ | 11.4+ | `s8 * s8 + s32 => {s32, s8}` | NHWC, NCxHWx | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu), [ncxhwx](/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu) | +| **TensorOp** | 80+ | 11.4+ | `s4 * s4 + s32 => {s32, s4}` | NHWC, NCxHWx | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu), 
[ncxhwx](/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu) |
diff --git a/media/docs/gemm_api_3x.md b/media/docs/gemm_api_3x.md
new file mode 100644
index 0000000000..c4a454896e
--- /dev/null
+++ b/media/docs/gemm_api_3x.md
@@ -0,0 +1,701 @@
+![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "CUTLASS GEMM API")
+
+[README](/README.md#documentation) > **CUTLASS 3.0 GEMM API**
+
+# CUTLASS 3.0 GEMM API
+
+CUTLASS presents a uniform programming model
+for matrix multiply-accumulate (MMA) operations
+at different levels of the GPU system hierarchy.
+CUTLASS 3.0 has GEMM APIs corresponding to the following levels,
+in order from highest to lowest.
+
+1. Device
+2. Kernel
+3. Collective
+4. Tiled MMA and Copy
+5. Atom
+
+This document will cover the first three levels in detail:
+Device, Kernel, and Collective.
+It also briefly discusses the Tiled MMA/Copy and Atom levels,
+and then refers readers to CuTe's tutorial for more information.
+
+# CUTLASS GEMM Model
+
+CUTLASS implements algorithms that express
+the classical "triply nested loop" GEMM algorithm
+with a tiled structure mirroring the above hierarchy.
+
+The following pseudocode describes the model for a GEMM kernel
+targeting a warp-synchronous matrix multiply instruction like `mma.sync`.
+The entire operation is referred to as "Gemm,"
+as it is assumed that an epilogue operation
+performs the general matrix update similar to BLAS.
+This is pseudocode and is only meant to illustrate which parts of the layers
+correspond to the inner or outer loops of the GEMM.
+
+```c++
+// cutlass::gemm::kernel::GemmUniversal: ClusterTileM and ClusterTileN loops
+// are either rasterized by the hardware or scheduled by the kernel in persistent kernels.
+// Parallelism over thread block clusters
+for (int cluster_m = 0; cluster_m < GemmM; cluster_m += ClusterTileM) {
+  for (int cluster_n = 0; cluster_n < GemmN; cluster_n += ClusterTileN) {
+
+    // cutlass::gemm::collective::CollectiveMma: mainloop that iterates over all k-tiles
+    // No loop unrolling is performed at this stage
+    for (int k_tile = 0; k_tile < size<2>(gmem_tensor_A); k_tile++) {
+
+      // loops inside cute::gemm(tiled_mma, a, b, c); Dispatch 5: (V,M,K) x (V,N,K) => (V,M,N)
+      // TiledMma uses the hardware instruction provided through its Mma_Atom
+      // TiledMma's atom layout, value layout, and permutations define the iteration order
+      for (int tiled_mma_k = 0; tiled_mma_k < size<2>(A); tiled_mma_k++) {
+        for (int tiled_mma_m = 0; tiled_mma_m < size<1>(A); tiled_mma_m++) {
+          for (int tiled_mma_n = 0; tiled_mma_n < size<1>(B); tiled_mma_n++) {
+
+            // TiledMma's vector mode dispatches to the underlying instruction.
+            mma.call(d, a, b, c);
+          } // tiled_mma_n
+        } // tiled_mma_m
+      } // tiled_mma_k
+    } // k_tile mainloop
+  } // cluster_n
+} // cluster_m
+```
+
+The first two nested `for` loops
+correspond to parallelism over thread block clusters.
+The code does not actually express them as explicit `for` loops.
+Instead, the parallelization scheme over tiles
+is implied by CUDA grid launch semantics.
+However, for persistent kernels,
+these two loops are expressed in the source code
+as a single `while` loop that queries the
+[work tile scheduler](/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp)
+for problem tiles on which to compute.
+
+Inside these nested `for` loops,
+one finds code that pulls matrix tiles
+from global memory into more "local" memory
+(like shared memory or registers)
+and computes MMAs.
+These tiled copy and tiled mma iterations are generally +fully static and get fully unrolled. + +# CUTLASS GEMM Components + +CUTLASS expresses the above loop nest +with the following components which are specialized for +data type, layout, and math instruction. + +| API level | API Class and/or function names | +| --- | --- | +| Device | `cutlass::gemm::device::GemmUniversalAdapter` | +| Kernel | `cutlass::gemm::kernel::GemmUniversal` | +| Collective | `cutlass::gemm::collective::CollectiveMma`
`cutlass::epilogue::collective::DefaultEpilogue`
`cutlass::epilogue::collective::Epilogue`
| +| Tiled (MMA and Copy) | `cute::TiledMma` and `cute::TiledCopy`
`cute::gemm()` and `cute::copy()` |
+| Atom | `cute::Mma_Atom` and `cute::Copy_Atom` |
+
+In CUTLASS 3.0, we assemble kernels
+by first composing a collective mainloop and collective epilogue
+together at the kernel layer,
+and then wrapping them with a host-side adapter
+to form a GEMM handle to that kernel.
+
+The following sections describe these components
+in the order in which a user should instantiate them
+to assemble a kernel. This order is:
+
+1. assemble the required collective mainloop and epilogues,
+
+2. compose them together to build a kernel type, and
+
+3. wrap up the kernel with a device layer adapter.
+
+This order is also reflected in the [CUTLASS 3.0 Hopper kernel examples](/examples/48_hopper_warp_specialized_gemm) as seen in the excerpt below.
+
+```c++
+// Step 1: Generate the required collective layer mainloop specialization
+using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+    ArchTag, OperatorClass,
+    ElementA, LayoutA, AlignmentA,
+    ElementB, LayoutB, AlignmentB,
+    ElementAccumulator,
+    TileShape, ClusterShape,
+    cutlass::gemm::collective::StageCountAuto,
+    cutlass::gemm::collective::KernelScheduleAuto
+  >::CollectiveOp;
+
+// Step 2: Specify the collective layer epilogue type
+using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue<
+    cutlass::gemm::TagToStrideC_t<LayoutC>,
+    cutlass::gemm::TagToStrideC_t<LayoutC>,
+    cutlass::epilogue::thread::LinearCombination<ElementC, 1, ElementAccumulator, ElementAccumulator>>;
+
+// Step 3: Compose the mainloop and epilogue together at the kernel layer
+using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+    cute::Shape<int,int,int,int>, // ProblemShape [M,N,K,L]
+    CollectiveMainloop,
+    CollectiveEpilogue
+>;
+
+// Step 4: Wrap up the kernel::GemmUniversal kernel class
+// with the device adapter to obtain a host-side handle to the kernel
+using GemmHandle = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+```
+
+Towards the end, we also briefly cover CuTe's tiled mma and copy as well as the atom layer APIs,
+before redirecting users to CuTe-specific documentation for further details.
+
+## Collective API
+
+A Collective is "the largest collection of threads
+onto which mma atoms and copy atoms are tiled."
+That is, it is the largest number of threads in a grid
+that can cooperate by leveraging hardware features
+for accelerated communication and synchronization.
+These hardware features include
+
+* asynchronous array copy
+  (e.g., from global memory to shared memory);
+
+* MMA instructions
+  for small tiles that live in shared memory;
+
+* synchronization operations for clusters,
+  thread blocks, and/or warps; and/or
+
+* hardware acceleration (such as barriers)
+  for ensuring that data dependencies
+  between asynchronous operations are met.
+
+A Collective uses the `TiledMma` and `TiledCopy` API (see below)
+to access operations that copy and perform MMA on tiles.
+
+Different units of parallelism
+(e.g., threads, warps, or thread blocks)
+in a Collective might have different roles.
+For example, in "warp-specialized" algorithms,
+some warps may be responsible for copying data,
+while others may be responsible for computation.
+Nevertheless, the different units of parallelism
+still need to share data and coordinate access
+to the shared data. For example,
+the producer warps in a warp-specialized algorithm
+that copy input matrix tiles into shared memory
+need to let the consumer MMA warp(s) know
+that their MMA inputs are ready.
+We contrast this with the `kernel::` layer API,
+which schedules the collectives over *independent* tiles in the grid.
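+
+As a concrete illustration of such role assignment -- a hedged sketch in plain
+CUDA, not CUTLASS's actual dispatch code -- a warp-specialized kernel might
+split its thread block into warp groups like this (the 128-thread warp-group
+size is an assumption for illustration):
+
+```c++
+// Assumption for illustration: one warp group = 128 threads (4 warps).
+constexpr int ThreadsPerWarpGroup = 128;
+int warp_group_idx = threadIdx.x / ThreadsPerWarpGroup;
+
+if (warp_group_idx == 0) {
+  // Producer warp group: issue copies (e.g., TMA loads) into shared memory
+  // buffers, then signal the consumers that a buffer has been filled.
+} else {
+  // Consumer warp group(s): wait until a buffer is filled, run MMAs on it,
+  // then release the buffer back to the producer.
+}
+```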
+
+The Collective API includes both the "mainloop"
+of matrix multiply-accumulate, and the epilogue.
+This API is the composition point for optimizations
+such as mainloop fusions and epilogue fusions.
+It is responsible for implementing
+the `k_tile` loop in the above triply nested loop pseudocode.
+
+### Collective Mainloops
+
+The `cutlass::gemm::collective::CollectiveMma` class
+is the primary interface to the collective
+matrix multiply-accumulate (MMA) mainloops.
+"Mainloop" refers to the "main loop" over tiles --
+the "cluster tile k" loop in the pseudocode
+near the top of this document.
+Any looping over multiple tiles that
+the algorithm might need to do would happen here.
+
+The `CollectiveMma` class is declared in the header
+[cutlass/gemm/collective/collective_mma.hpp](/include/cutlass/gemm/collective/collective_mma.hpp).
+
+```c++
+namespace cutlass::gemm::collective {
+
+template <
+  class DispatchPolicy,
+  class TileShape,
+  class ElementA,
+  class StrideA,
+  class ElementB,
+  class StrideB,
+  class TiledMma,
+  class GmemTiledCopyA,
+  class SmemLayoutAtomA,
+  class SmemCopyAtomA,
+  class TransformA,
+  class GmemTiledCopyB,
+  class SmemLayoutAtomB,
+  class SmemCopyAtomB,
+  class TransformB
+>
+struct CollectiveMma {
+  static_assert(sizeof(ElementA) == 0, "Could not find a mainloop specialization.");
+};
+
+} // namespace cutlass::gemm::collective
+```
+
+- `DispatchPolicy` is the most important type for a collective, and is
+[covered in more detail below](#collective-dispatch-policies).
+
+- `StrideA` and `StrideB` are instances of type `cute::Stride` that represent the global memory layout of the A and B tensors. These strides are required to be rank-3, representing the modes `[outer, inner, batch]`. Each of the 3 ranks can be a multi-modal hierarchical stride; this would apply if implementing a tensor contraction.
+
+- `TiledMma` is an instance of `cute::TiledMma`.
+
+- `GmemTiledCopyA` and `GmemTiledCopyB` are instances of `cute::TiledCopy` types. Both tiled operation types are [covered in more detail below](#tiled-mma-and-copy).
+
+- `SmemLayoutAtomA` and `SmemLayoutAtomB` are instances of type `cute::Layout` and represent the smallest
+layout that will get tiled over the entire collective's shared memory. This layout does _not_ include the
+pipeline mode, and therefore, both are expected to be rank-2 layouts of shape [`outer`, `inner`].
+
+- `SmemCopyAtomA` and `SmemCopyAtomB` are `Copy_Atom`s to be used for moving data from shared memory
+into register memory.
+
+Notice that CUTLASS 3.0 mainloops do not accept a dedicated accumulator element type.
+We obtain the accumulator type from `typename TiledMma::ValTypeC`. Note also that
+the top-level API's `ElementA` and `ElementB` can differ from those of the MMA-facing
+`typename TiledMma::ValTypeA` and `typename TiledMma::ValTypeB`, allowing TMA or
+user-supplied transform operations to perform type conversions.
+
+### Collective Dispatch Policies
+
+`CollectiveMma` implementations are not generic.
+Instead, they must be specialized for each algorithm and GPU architecture.
+Users can dispatch to a `CollectiveMma` specialization
+by picking template arguments matching that specialization.
+CUTLASS 3.0 adopts a tag-based dispatch policy type to specialize
+mainloop implementations and add tuning knobs to them.
+
+Below is an example of one of the dispatch policies that is used to dispatch to a Hopper TMA
+warp-specialized mainloop implementation:
+
+```c++
+// n-buffer in smem (Hopper TMA),
+// pipelined with Hopper GMMA and TMA,
+// warp-specialized dynamic schedule
+template<
+  int Stages_,
+  class ClusterShape_ = Shape<_1,_1,_1>,
+  class KernelSchedule = KernelTmaWarpSpecialized
+>
+struct MainloopSm90TmaGmmaWarpSpecialized {
+  constexpr static int Stages = Stages_;
+  using ClusterShape = ClusterShape_;
+  using ArchTag = arch::Sm90;
+  using Schedule = KernelSchedule;
+};
+```
+
+The `Stages_` template parameter lets the user freely vary the number of pipeline stages,
+while the `ClusterShape_` type allows for parameterization over the shape of the threadblock
+cluster over which TMA multicast will take place.
+
+The collective dispatch policy is also the primary point of composing various kernel schedules
+freely with any mainloop. Each mainloop policy either prescribes a `Schedule` with which
+it needs to be run, or exposes a template API that lets the user pick a subset of the following schedules:
+
+```c++
+struct KernelMultistage { };
+struct KernelTma { };
+struct KernelTmaWarpSpecialized { };
+struct KernelTmaWarpSpecializedPersistent { };
+```
+
+- A single kernel schedule can support multiple mainloop implementations. For example,
+`KernelMultistage` can be composed with many different mainloop implementations across GPU
+architectures such as `MainloopSm70TwoStage`, `MainloopSm80CpAsyncUnpredicated`, `MainloopSm90CpAsyncGmma`, and many more.
+
+- A single mainloop can be composed with multiple
+possible kernel schedules. For example, the `MainloopSm90TmaGmmaWarpSpecialized` can be
+composed with either the `KernelTmaWarpSpecialized` or `KernelTmaWarpSpecializedPersistent`
+kernel schedules.
+
+As [discussed in the CUTLASS 3.0 design documentation](cutlass_3x_design.md), adopting tag
+dispatch policies for our core vocabulary types allows us to maintain a single type name for
+all operations that conceptually belong to the same class. This design has the following benefits.
+
+- It *avoids code duplication* in cases where mainloops can be composed with multiple kernels or vice versa.
+- It *makes writing generic code easier*, as the primary type name `CollectiveMma` does not change across any implementation.
+- It *provides a clear, singular extension point* for users to plug in new, custom mainloop implementations specialized on their own dispatch policies.
+
+### Collective Builder for `CollectiveMma`s
+
+The primary `CollectiveMma` is intended to be an expert user interface that allows full control over
+all the properties of the collective's GPU micro-kernel. However, often a user just wants an
+off-the-shelf GEMM mainloop implementation parameterized on simple configuration parameters. CUTLASS 3.0
+provides [`cutlass::gemm::collective::CollectiveBuilder`](/include/cutlass/gemm/collective/collective_builder.hpp) for such scenarios.
+
+```c++
+namespace cutlass::gemm::collective {
+template <
+  class ArchTag,
+  class OpClass,
+  class ElementA,
+  class GmemLayoutA,
+  int AlignmentA,
+  class ElementB,
+  class GmemLayoutB,
+  int AlignmentB,
+  class ElementAccumulator,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class StageCountType,
+  class KernelScheduleType,
+  class Enable = void
+>
+struct CollectiveBuilder {
+  static_assert(sizeof(ElementA) == 0, "Could not build a collective for given parameters.");
+};
+} // namespace cutlass::gemm::collective
+```
+
+`CollectiveBuilder` accepts CUTLASS 2.x equivalent input template arguments, and attempts to build
+the best performing `CollectiveMma` from the given parameters.
+
+- `ArchTag` is one of the SM architecture tags from `cutlass::arch::Sm*`.
+- `OpClass` is one of the operator class tags, such as `cutlass::arch::OpClassTensorOp` or `cutlass::arch::OpClassSimt`.
+- `ElementA` and `ElementB` are the logical value types of the A and B tensors, respectively.
+- `ElementAccumulator` is the accumulator type to be used in the instruction.
+- `GmemLayoutA` and `GmemLayoutB` are CUTLASS 2.x layout tags, `layout::RowMajor` or `layout::ColumnMajor`.
+- `AlignmentA` and `AlignmentB` are global memory alignments of the A and B tensors in terms of element count.
+- `TileShape_MNK` is an instance of `cute::Shape` that is rank-3, representing the MxNxK collective tile shape.
+- `ClusterShape_MNK` is an instance of `cute::Shape` that is rank-3, representing the MxNxK threadblock cluster tile shape.
+- `StageCountType` is either `collective::StageCountAuto` or an instance of `collective::StageCount`.
+- `KernelScheduleType` is either `collective::KernelScheduleAuto` or one of the specific kernel schedule tags discussed in the [dispatch policy section](#collective-dispatch-policies) above.
+
+`StageCountAuto` allows the collective builder to compute the shared memory size of a single stage
+and maximize the shared memory usage, assuming 1 threadblock / multiprocessor occupancy.
+
+`KernelScheduleAuto` allows the collective builder to pick the best kernel schedule available for the
+given set of parameters, or lets the user override this with a specific kernel schedule type.
+
+Note that collective builders are still in beta, and their functionality
+does not map onto the full design space that the primary expert `CollectiveMma` API
+allows for. We expect their supported mainloop types to expand in future releases, but
+with 3.0, only SM90 tensorop kernels are supported through the builder API. The builder API
+may also change in the future as we adopt user feedback.
+
+If the builder is able to provide a collective mainloop type for the given set of parameters,
+it will be aliased within the builder as `CollectiveOp`. For more information on how to
+parameterize kernels conveniently with the collective builder, please see example [49_hopper_gemm_schedules_with_collective_builder](/examples/49_hopper_gemm_schedules_with_collective_builder).
+
+### Epilogue
+
+The collective epilogue implements element-wise operations
+involving the output matrix. Users can provide a custom
+epilogue, or use one of the standard epilogues.
+These live in the directory
+[include/cutlass/epilogue/collective/](../../include/cutlass/epilogue/collective/),
+and include classes like
+`cutlass::epilogue::collective::DefaultEpilogue`
+and
+`cutlass::epilogue::collective::Epilogue`.
+CUTLASS's provided collective epilogues
+do not live under `include/cutlass/gemm`
+or in the `cutlass::gemm` namespace,
+because they can be used for computations
+other than GEMM.
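+
+To make this composition concrete, the following sketch shows how a default
+epilogue might be assembled from the thread-level `LinearCombination` functor.
+This is an illustrative sketch, not code from the examples: the element and
+layout types chosen here (`ElementC`, `ElementAccumulator`, `LayoutC`) are
+assumptions.
+
+```c++
+// A minimal sketch: compose a thread-level epilogue functor into a
+// collective epilogue. The types below are assumed for illustration.
+using ElementC           = cutlass::half_t;
+using ElementAccumulator = float;
+using LayoutC            = cutlass::layout::ColumnMajor;
+
+// D = alpha * Acc + beta * C, applied element-wise by each thread
+using ThreadEpilogueOp = cutlass::epilogue::thread::LinearCombination<
+    ElementC,              // output element type
+    1,                     // elements computed per access
+    ElementAccumulator,    // accumulator element type
+    ElementAccumulator>;   // compute element type for alpha/beta scaling
+
+// Wrap the thread-level functor in a collective epilogue
+using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue<
+    cutlass::gemm::TagToStrideC_t<LayoutC>,  // stride type of C
+    cutlass::gemm::TagToStrideC_t<LayoutC>,  // stride type of D
+    ThreadEpilogueOp>;
+```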
+
+## Kernel API
+
+The kernel is "a collection of all clusters in the grid."
+The kernel layer schedules have four main responsibilities.
+
+- Ordering the execution of collectives within the kernel, performing any synchronization between them that may be necessary
+- Marshalling the threads of warp-specialized schedules into their respective roles
+- Performing any necessary grid swizzling logic
+- Tiling the input tensors with the threadblock cluster value tile before invoking the collectives on them
+
+The Kernel API is the entry point for a grid of thread blocks
+that may or may not be organized in a cluster.
+It is the composition point for fusing back-to-back GEMMs,
+epilogues, and/or other operations.
+
+The entry point API for CUTLASS 3.0 kernels is the class
+`cutlass::gemm::kernel::GemmUniversal`, found in the header file
+[include/cutlass/gemm/kernel/gemm_universal.hpp](../../include/cutlass/gemm/kernel/gemm_universal.hpp).
+`GemmUniversal` is a stateless universal device kernel
+that implements GEMM as the composition of two parts:
+
+* a collective mainloop, and
+* a collective epilogue
+
+```cpp
+namespace cutlass::gemm::kernel {
+/*
+ * Stateless universal device GEMM kernel type that treats GEMM as
+ * a composition of a collective mainloop and a collective epilogue.
+ *
+ * Supports both the 2.x and 3.x APIs based on whether the first type is
+ * a cute::tuple<> or not.
+ * 2.x API implementation: cutlass/gemm/kernel/gemm_universal.h
+ * 3.x API implementation: cutlass/gemm/kernel/gemm_*.hpp
+ *
+ * In the following declaration, the name preceding the 'Or' refers to
+ * 3.x API type argument order, and the name succeeding the 'Or' refers to
+ * 2.x API type argument order. Template arguments without two names
+ * belong to the 3.x API only.
+**/
+template <
+  class ProblemShapeOrThreadblockMma_, // (m, n, k) or (m, n, k, l)
+  class CollectiveMainloopOrEpilogue_,
+  class CollectiveEpilogueOrThreadblockSwizzle_,
+  class GridSwizzle_ = void,
+  class Enable = void
+>
+class GemmUniversal;
+} // namespace cutlass::gemm::kernel
+```
+
+*Stateless* means that the caller --
+for example, the Device API described in the next section --
+manages the kernel's state.
+The kernel just takes input and output parameters (`Params`).
+
+*Universal* means that `GemmUniversal` works
+for both CUTLASS 3.0 and 2.x interfaces
+and across a broad range of kernel schedules.
+If `GemmUniversal`'s first template argument is a `cute::Shape`,
+then `GemmUniversal` assumes that the remaining template arguments
+implement the 3.0 APIs. Otherwise, `GemmUniversal` assumes that
+the remaining template arguments implement the 2.x APIs.
+Starting with CUTLASS 3.0, the problem shape has been promoted
+to a top-level template API for the GEMM kernel.
+This supports fully static GEMM instantiations
+where the user expects to know some or all
+of the problem shapes at compile time
+in order to extract even more performance.
+
+The *collective mainloop* implements MMA on local tiles.
+The *collective epilogue* addresses any operations after the MMA,
+such as applying the `beta * C` part of `C := beta * C + alpha * A * B`.
+Both are explained in the Collective API section above.
+
+Specializations of `kernel::GemmUniversal` for 3.0 APIs live in
+any of various `gemm_*.hpp` files in the directory
+[include/cutlass/gemm/kernel/](../../include/cutlass/gemm/kernel/).
+Specializations for 2.x APIs can be found in the header file
+[include/cutlass/gemm/kernel/gemm_universal.h](../../include/cutlass/gemm/kernel/gemm_universal.h).
+
+CUTLASS 3.x implements various embodiments of `kernel::GemmUniversal`
+in the `include/cutlass/gemm/kernel/{arch_tag}*.hpp` files mentioned above.
+Each kernel layer schedule is specialized
+for a GEMM scheduling algorithm and GPU architecture.
+Which specialization to dispatch to is decided through the dispatch policy's `Schedule` type.
+
+For example, the header file
+[include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_persistent.hpp](../../include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_persistent.hpp)
+has a specialization of `kernel::GemmUniversal` for Hopper
+that uses a warp-specialized mainloop with a persistent scheduling algorithm,
+while the header file
+[include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp](../../include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp)
+has a specialization of `GemmUniversal` for Hopper
+that uses a warp-specialized but non-persistent algorithm.
+
+To support composition between supported kernel schedules and mainloop dispatch policies without having to
+duplicate collective mainloop implementations, GEMM kernel layer schedules can be composed with
+any mainloop that specifies their corresponding kernel schedule as their `Schedule` type in the policy.
+This is discussed in detail in the [collective dispatch policy section](#collective-dispatch-policies) above.
+
+```c++
+// An example of the SM90 KernelMultistage kernel's
+// specialization logic that allows it to be composed
+// with many mainloops such as `MainloopSm80CpAsync`
+// and `MainloopSm70TwoStage`.
+template <
+  class ProblemShape_,
+  class CollectiveMainloop_,
+  class CollectiveEpilogue_,
+  class GridSwizzle_
+>
+class GemmUniversal<
+  ProblemShape_,
+  CollectiveMainloop_,
+  CollectiveEpilogue_,
+  GridSwizzle_,
+  std::enable_if_t<
+    std::is_base_of_v<KernelMultistage, typename CollectiveMainloop_::DispatchPolicy::Schedule>>>
+```
+
+## Device API
+
+The Device API is a universal, kernel-agnostic host interface
+for kernel launch and managing the lifetime of
+reusable host-side parameters.
+
+This API is how users' host-side .cu code
+invokes CUTLASS's single-GPU GEMM kernels.
+It serves the same purpose as cuBLAS and behaves similarly.
+
+The entry point for the Device GEMM API is the class
+`cutlass::gemm::device::GemmUniversalAdapter`.
+This class lives in the header file
+[include/cutlass/gemm/device/gemm_universal_adapter.h](/include/cutlass/gemm/device/gemm_universal_adapter.h).
+`GemmUniversalAdapter` is a stateful, reusable handle,
+which is parameterized on the `cutlass::gemm::kernel` type.
+
+```c++
+/*!
+  GemmUniversalAdapter is a stateful, reusable GEMM handle built around a kernel
+  of type cutlass::gemm::kernel::*
+
+  It manages the lifetime of the underlying `kernel::Params` struct, and exposes APIs
+  to create it from the host facing arguments. For power users, new static methods
+  are exposed in 3.x APIs that bypass the stateful methods or args->params lowering.
+
+  It supports kernel types that implement both the 2.x and 3.0 APIs;
+  however, this is done by specializing the implementation of GemmUniversalAdapter
+  on the two kernel API types, and thus, GemmUniversalAdapter's behavior might
+  differ between the two specializations.
+*/
+template <class GemmKernel_, class Enable = void>
+class GemmUniversalAdapter;
+```
+
+*Stateful* means that the handle instance contains state
+that the kernel needs to run.
+This means that the user must initialize the handle first,
+then use the initialized handle instance to run the kernel.
+Statefulness also means that the handle can manage the lifetime
+of the kernel's `Params` -- the parameters of the kernel itself.
+An important duty of `GemmUniversalAdapter`
+is to map from the user's `Arguments` --
+what the user sees as the kernel's parameters --
+to the `Params` that the kernel actually sees.
+For power users, the class exposes new static methods
+in 3.0 APIs that can bypass stateful methods
+or go directly to `Params` without intermediate `Arguments`.
+
+*Reusable* means that the handle instance can be used
+to call the kernel multiple times with different arguments
+(e.g., different matrices).
+Reusing the handle may be more efficient than just
+creating a new handle for each kernel invocation.
+
+*Parameterized on the kernel type* means that
+the `GemmUniversalAdapter` class' behavior
+depends on the GEMM kernel type (see the previous section).
+Specifically, `GemmUniversalAdapter` has a template parameter
+`GemmKernel`, which is the GEMM kernel type.
+Valid template arguments for `GemmKernel` are
+
+* `cutlass::gemm::kernel::GemmUniversal`,
+  implementing CUTLASS 3.x API kernels;
+* `cutlass::gemm::kernel::GemmUniversal`,
+  implementing CUTLASS 2.x API kernels; or
+* any valid CUTLASS 2.x `kernel` layer GEMM that
+  was previously composable with the `device::GemmUniversalAdapter`.
+
+`GemmUniversalAdapter` presents a single
+host-side interface to both 3.0 and 2.x kernels.
+CUTLASS accomplishes this by
+specializing `GemmUniversalAdapter`'s implementation
+on either the 2.x API implementing kernel layer GEMMs, or on the 3.x API
+implementing kernel layer GEMMs. The metafunction [`cutlass::gemm::detail::IsCutlass3GemmKernel`](cutlass_3x_backwards_compatibility.md#kernel-api-design-differences)
+is what `GemmUniversalAdapter` uses to distinguish between 2.x and 3.x kernels.
+
+`GemmUniversalAdapter` sets up and launches the kernel, using the
+CUDA extended launch API for threadblock cluster support if required.
+Note that `GemmUniversalAdapter` does *not* specify the grid shape.
+The kernel controls the grid shape
+and other kernel-specific launch parameters.
+This makes it possible for all 3.0 kernels
+to use the same kernel launch code,
+thus factoring out kernel launch from the actual kernel.
+
+## Tiled MMA and Copy
+
+The Tiled MMA and Tiled Copy are tilings of MMA Atoms and Copy Atoms,
+respectively, across threads and data, with possible permutations applied to the
+resulting tiling. This layer is most analogous to the warp-level
+tiling of MMA instructions in CUTLASS 2.x. However, it views the tiling
+from the perspective of all threads participating in the operation
+and generalizes the concept to copy operations as well. The purpose
+of this layer is to build composable GPU micro-kernels out of a plethora
+of hardware-accelerated math and data movement operations, each with their
+own unit layouts in threads and data. The tiled MMA and Copy types present
+all these various hardware-accelerated CuTe Atoms with a single, consistent
+API.
+
+The resulting tiled operation acts as a single MMA or copy operation
+that users can invoke in the "inner" loop
+of the three-nested-loops pseudocode
+at the top of this document using `cute::gemm()` or `cute::copy()`.
+
+We call this API "tiled" because it constructs
+larger operations out of the Atoms provided by CuTe,
+as if fitting together individual tiles
+to build a reusable component of a mosaic.
+For example, CuTe might provide an MMA Atom +that users can call on a single warp, +for fixed M, N, and K dimensions. +CUTLASS can then use CuTe operations like `make_tiled_mma` +to turn this Atom into an operation +that works on an entire thread block, +for larger M, N, and K dimensions. + +## Atom API + +An "Atom" is the smallest collection of threads and data +that must participate in the execution of a hardware-accelerated +math or copy operation. + +An Atom is "atomic" (indivisible) not in the sense of +concurrent memory operations like `atomicAdd` +(which are "indivisible in time (causality)"), +but in the sense of indivisibility in "space" -- +the number of values and the groups of parallel workers +that must participate in the operation together. + +An Atom uses CuTe Layouts to express the required +dimensions and strides of its input and output arrays. +Generally these are fixed at compile time. + +The Atom API wraps calls to actual hardware instructions +that accelerate MMA or copy operations. +Users can ask for GPU architecture-specific implementations, +or just pick generic implementations and rely on +whatever GPU architectures were enabled. + +For more information about Atoms, +please refer to CuTe's tutorial, e.g., the sections on + +* [algorithms](./cute/04_algorithms.md) like `gemm` and `copy`, + +* [MMA Atoms](./cute/0t_mma_atom.md#cute-mma-atoms), and + +* [a GEMM example](./cute/0x_gemm_tutorial.md). + +# Copyright + +Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +SPDX-License-Identifier: BSD-3-Clause + +``` + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + 3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +``` diff --git a/media/docs/layout.md b/media/docs/layout.md index f8e21da048..eb68abcdb3 100644 --- a/media/docs/layout.md +++ b/media/docs/layout.md @@ -2,6 +2,11 @@ [README](/README.md#documentation) > **Layouts and Tensors** +Note: This document talks about CUTLASS 2.x layout tag types. +CUTLASS 3.0 deprecates all legacy 2.x layout tags in favour of a single `cute::Layout` +vocabulary type for all thread and data tensors. 
Please refer to the
+[documentation for cute layouts](/media/docs/cute/01_layout.md) for more details about CUTLASS 3.0's definition of "layout".
+
# Layouts and Tensors

_Tensors_ are mathematical objects represented by a multidimensional array of numeric elements in memory.
diff --git a/media/docs/pipeline.md b/media/docs/pipeline.md
new file mode 100644
index 0000000000..ccf8385953
--- /dev/null
+++ b/media/docs/pipeline.md
@@ -0,0 +1,210 @@
+# Synchronization primitives
+
+## Overview of CUDA's synchronization methods
+
+The CUDA programming model provides three abstractions:
+
+* hierarchical parallelism -- that is, parallel threads
+  grouped into hierarchical units such as blocks and clusters;
+
+* shared memory, through which parallel threads that are
+  in the same hierarchical unit can communicate; and
+
+* synchronization methods for threads.
+
+These abstractions help developers extract
+both fine-grained and coarse-grained parallelism,
+by making it possible for them to subdivide problems
+into independent components,
+and to insert synchronization at appropriate points.
+
+Over the years, CUDA has introduced several synchronization primitives
+that operate at different levels of the hierarchy.
+These include
+
+* [thread block-level](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#synchronization-functions) synchronization (e.g., `__syncthreads()`);
+
+* [warp-level](https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/) synchronization (e.g., `__syncwarp()`); and
+
+* [thread-level](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#memory-fence-functions) fence operations.
+
+As an extension to this, starting with the Hopper architecture, CUDA added the following improvements:
+
+* [thread block clusters](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#thread-block-clusters) --
+  a new level in the thread hierarchy representing
+  a group of thread blocks that can coordinate and share data;
+
+* synchronization instructions for a thread block cluster and threads within a cluster scope.
+
+## CUTLASS's abstractions for Hopper features
+
+CUTLASS now includes abstractions
+for the following features introduced in Hopper.
+
+1. Thread block cluster-level synchronization and query
+   [APIs](/include/cute/arch/cluster_sm90.hpp)
+
+2. Abstractions for new
+   [barrier instructions](/include/cutlass/arch/barrier.h)
+   which help with efficient synchronization
+   of threads within a thread block cluster.
+
+### Asynchronous pipelines
+
+In order to write a performant GEMM kernel,
+software pipelining is critical to hide the latency of global memory loads.
+(Please refer to the
+[Efficient GEMM](/media/docs/efficient_gemm.md#pipelining) document.)
+Different threads or groups of threads
+may have different roles in the pipeline.
+Some are "producers" that load data or perform computations
+to satisfy other threads' input data dependencies.
+The same or different threads may be "consumers"
+that do other work with those input data dependencies,
+once they are satisfied.
+Starting with the Hopper architecture,
+the presence of hardware-accelerated synchronization instructions
+makes it possible for "producer" and "consumer" threads
+to communicate with each other efficiently
+about their data dependencies.
+
+Implementing a persistent GEMM algorithm calls for managing
+dozens of different kinds of asynchronously executing operations
+that synchronize using multiple barriers organized as a circular list.
+This complexity is too much for human programmers to manage by hand.
+As a result, we have developed
+[asynchronous Pipeline classes](/include/cutlass/pipeline.hpp).
+These classes help developers orchestrate a pipeline
+of asynchronous producer and consumer threads,
+without needing to worry about lower-level hardware details.
+These classes serve a similar function as the various
+[pipeline abstractions](https://nvidia.github.io/libcudacxx/extended_api/synchronization_primitives/pipeline.html)
+in libcu++.
+
+#### Pipeline methods
+
+##### Producer acquire
+
+The `producer_acquire` method is to be used by asynchronous producer threads
+before issuing other instructions associated with a particular pipeline stage
+(e.g., copy or write).
+
+This is a blocking instruction:
+it blocks further execution of the producer threads
+until the particular stage waiting to be acquired
+is released by a consumer.
+
+We say that a pipeline at its start is "empty" if producer threads are free to produce and do not need to wait for a consumer release -- that is, if an acquire operation is expected to succeed. If the pipeline at its start is empty, then we can either skip performing producer acquire operations during the first pass through the pipeline stages, or use the `make_producer_start_state` method. The latter ensures that the acquire operation will succeed at the start of a pipeline.
+
+##### Producer commit
+
+The `producer_commit` method is to be issued by asynchronous producer threads
+after the instructions associated with a particular stage
+(e.g., shared memory writes) have completed,
+in order to notify the waiting asynchronous consumer threads.
+This is a nonblocking instruction.
+
+This API may result in a no-op in some cases,
+if the producer instructions themselves automatically update
+the barrier associated with that stage
+(e.g., TMA-based producer threads using the `PipelineTmaAsync` class).
+
+##### Consumer wait
+
+The `consumer_wait` method is to be used by consumer threads
+before consuming data from a particular pipeline stage
+which is expected to be produced by producer threads.
+
+This is a blocking instruction. That is,
+until the producer threads have committed to a particular stage,
+this instruction is expected to block further execution of consumer threads.
+
+##### Consumer release
+
+The `consumer_release` method is to be used by consumer threads
+to signal waiting producer threads that they have finished consuming data
+associated with a particular stage of the pipeline.
+This is a nonblocking instruction.
+
+#### Pipeline example
+
+```c++
+// 4-stage Pipeline
+static constexpr int NumStages = 4;
+using MainloopPipeline = typename cutlass::PipelineAsync<NumStages>;
+using PipelineState = typename cutlass::PipelineState<NumStages>;
+
+// 2 producer threads and 1 consumer thread
+typename MainloopPipeline::Params params;
+params.producer_arv_count = 2;
+params.consumer_arv_count = 1;
+MainloopPipeline pipeline(shared_storage.storage, params);
+
+// Producer threads
+if (thread_idx == 0 or thread_idx == 1) {
+  PipelineState smem_pipe_write = cutlass::make_producer_start_state<MainloopPipeline>();
+  for ( ; iter > 0; --iter) {
+    pipeline.producer_acquire(smem_pipe_write);
+
+    // Producer ops
+    // If any memory operations are involved, then we also need
+    // to guarantee that writes are completed and visible to consumer(s).
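+    //
+    // For example (an illustrative assumption, not part of the original
+    // sample): with cp.async-based producers, one might issue
+    // cutlass::arch::cp_async_fence() here so that the shared memory writes
+    // are ordered before the commit below; TMA-based producers using
+    // PipelineTmaAsync update the stage's barrier automatically instead.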
+
+    pipeline.producer_commit(smem_pipe_write.index());
+    ++smem_pipe_write;
+  }
+}
+else if (thread_idx == 2) {
+  PipelineState smem_pipe_read;
+  for (; iter > 0; --iter) {
+    pipeline.consumer_wait(smem_pipe_read);
+
+    // Consumer ops
+
+    pipeline.consumer_release(smem_pipe_read);
+    ++smem_pipe_read;
+  }
+}
+```
+
+In this example, we create an instance of the asynchronous pipeline class `PipelineAsync`,
+and then synchronize among 3 asynchronously executing threads:
+2 producer threads and 1 consumer thread.
+
+Please note that this is a basic example.
+There are different versions possible,
+depending on what the producer and consumer threads are doing.
+Please refer to our [unit tests](/test/unit/pipeline)
+and the other [pipeline classes](/include/cutlass/pipeline.hpp)
+for more details.
+
+# Copyright
+
+Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+  1. Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+  2. Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+  3. Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
diff --git a/media/docs/profiler.md b/media/docs/profiler.md
index a841a80757..b8e409fc05 100644
--- a/media/docs/profiler.md
+++ b/media/docs/profiler.md
@@ -13,7 +13,7 @@ The CUTLASS Profiler may be compiled with: $ make cutlass_profiler -j ```
-To limit compilation time, only one tile size (typically 128x128) is instantiated for each data type,
+To limit compilation time, only one tile size (typically 128x128) and threadblock cluster size (typically 2x1x1) are instantiated for each data type,
math instruction, and layout. To instantiate all sizes, set the following environment variable when running CMake from an empty `build/` directory. ```bash
@@ -168,8 +168,8 @@ Example: The CUTLASS Profiler is capable of executing GEMM and Sparse GEMM problems. The CUTLASS Profiler can be built with cuBLAS enabled to use as a reference implementation. If CMake detects
-the cuBLASS library available in the system, it is included as a dependency. This may be explicitly overridden
-with CMake flag `CUTLASS_ENABLE_CUBLAS`.
+the cuBLAS library available in the system, it is included as a dependency.
This may be explicitly overridden
+with CMake flag `CUTLASS_ENABLE_CUBLAS`.

## GEMM Arguments

@@ -197,6 +197,9 @@ GEMM
  [int] --cta_m,--threadblock-shape::m Threadblock shape in the M dimension.
  [int] --cta_n,--threadblock-shape::n Threadblock shape in the N dimension.
  [int] --cta_k,--threadblock-shape::k Threadblock shape in the K dimension.
+ [int] --cluster_m,--cluster-shape::m Cluster shape in the M dimension.
+ [int] --cluster_n,--cluster-shape::n Cluster shape in the N dimension.
+ [int] --cluster_k,--cluster-shape::k Cluster shape in the K dimension.
  [int] --stages,--threadblock-stages Number of stages of threadblock-scoped matrix multiply.
  [int] --warps_m,--warp-count::m Number of warps within threadblock along the M dimension.
  [int] --warps_n,--warp-count::n Number of warps within threadblock along the N dimension.
@@ -342,7 +345,50 @@ To faclitate generation of pivot tables and charts, additional columns may be pr
$ ./tools/profiler/cutlass_profiler --kernels=cutlass_simt_sgemm_128x128_nn \
  --m=3456 --n=4096 --k=8:4096:8 --output=report.csv \
  --tags=cutlass:2.2,date:2020-06-08
-```
+```
+
+## CUTLASS 3.0 GEMM procedural names
+
+CUTLASS 3.0 introduces a new naming convention for GEMMs used by the profiler targeting the NVIDIA
+Hopper architecture and beyond so as to indicate new features of the kernel within the name
+(e.g., the cluster shape).
+
+To best illustrate this naming convention, we will walk through the meaning of each of the components
+in a GEMM kernel used by the profiler:
+```
+cutlass3x_sm90_tensorop_s64x128x16gemm_f16_f16_f32_f16_128x128x64_2x1x1_0_ntn_align8
+```
+
+The components within this name are as follows:
+
+* `cutlass3x`: indicates that the kernel was generated through the CUTLASS 3.0 API
+* `sm90`: indicates that the kernel targets NVIDIA GPUs with compute capability 90
+* `tensorop`: indicates that the kernel makes use of NVIDIA Tensor Cores
+(as opposed to `simt`, which indicates the use of "CUDA cores")
+* `s`: indicates that the Tensor Core instruction being used accumulates in single precision
+(as opposed to `h`, which indicates half precision)
+* `64x128x16gemm`: indicates that the shape of the Tensor Core instruction being used (MxNxK) is 64x128x16
+* `f16_f16_f32_f16`: indicates that the data types for operands A, B, and C are each `f16`
+(half precision) and that accumulation is performed using `f32` (single precision)
+* `128x128x64`: indicates that the thread block shape used in the GEMM (MxNxK) is 128x128x64
+* `2x1x1`: indicates that the cluster shape being used is 2x1x1
+* `0`: indicates that the kernel uses the CollectiveBuilder's automatic stage calculation to determine the
+number of pipeline stages in the kernel. Note that `0` does not mean that no stages are used. A nonzero value indicates that automatic stage calculation was not performed, and gives the number of pipeline stages to be used.
+This 0 is only added to the kernel's procedural name; the profiler will still report the actual stage count
+when printing the kernel argument details (`--stages=N`), and kernel discovery will still support filtering through the `--stages` argument.
+* `ntn`: indicates that the layouts for operands A, B, and C are column major ("n"; non-transposed),
+row major ("t"; transposed), and column major, respectively.
+* `align8`: indicates that the maximum alignment between operands A and B is 8.
+
+Note that in some special cases where the input A/B types do not match those of the MMA
+instruction, the MMA-facing input type is added to the instruction string as well.
+
+```
+cutlass3x_sm90_tensorop_s64x128x8tf32gemm_f32_f32_f32_f32_128x128x32_2x1x1_0_tnn_align4
+```
+
+* `s64x128x8tf32gemm`: indicates that the MMA consumes inputs in `tf32` format, and therefore
+the kernel performs rounding of the `f32` values in global memory while loading them into shared memory.

# Convolution

diff --git a/media/docs/programming_guidelines.md b/media/docs/programming_guidelines.md
index 59c2f57f0c..8e454fa42f 100644
--- a/media/docs/programming_guidelines.md
+++ b/media/docs/programming_guidelines.md
@@ -6,32 +6,23 @@
## Hierarchical Organization

-CUTLASS embodies a design paradigm exemplified by the [CUB library](https://nvlabs.github.io/cub/)
-for expressing collective operations. Objects expose an interface for a problem that is then decomposed
-into concurrent subtasks executed by cooperating threadblocks, warps, and threads. For example, a grid-level
-object may be constructed with base pointers to the start of a GEMM operation, add a threadblock-dependent
-offset to partition the problem, and then compute a per-threadblock GEMM. This in turn performs some
-operations as a collection of cooperating threads, while it may partition other parts of the task into
-warp-level subtasks.
-
-Consequently, CUTLASS components are organized by the computation then by the layer of
-the following hierarchy.
-
-* *device*: an operation is _device-wide_ and may launch one or more kernels on the GPU
-* *kernel*: an operation is implemented by a CUDA kernel with definitions for `__shared__` memory and constant memory allocations
-* *threadblock*: an operation is collectivey executed by a threadblock; any component calling `__syncthreads()` is likely to be threadblock-scope
-* *warp*: an operation is collectively executed by a warp; threads within the context of a warp are referred to as _lane_
-* *thread*: an operation is performed by an individual thread with no other data sharing or interaction with other threads
-* *instruction*: an operation corresponds to an individual hardware or PTX instruction
+The [CUTLASS 3.0 GEMM API](./gemm_api_3x.md) document
+explains CUTLASS 3.0's hierarchical organization,
+based conceptually on parallelization strategy.
+This differs from CUTLASS 2.x's approach,
+which more closely mirrors the GPU hardware hierarchy
+of thread blocks, warps, and threads.

## Design Patterns

-CUTLASS strives to achieve the highest performance possible on NVIDIA GPUs while also offering a
-flexible composition that an be easily applied to solve new problems related to Deep Learning and
-linear algebra. Though we intend to make CUTLASS as simple and straightforward as possible, given
-a tradeoff between simplicity and performance, CUTLASS chooses performance. Consequently, several
-design patterns are necessary to yield a composable structure while also satisfying these performance
-objectives. This section is intended to provide more detail.
+CUTLASS aims for the highest performance possible on NVIDIA GPUs.
+It also offers flexible components that can be assembled and customized
+to solve new problems related to deep learning and linear algebra.
+Given a tradeoff between simplicity and performance,
+CUTLASS chooses performance.
+Consequently, several design patterns are necessary
+to yield a composable structure
+while also satisfying these performance objectives.
### Templates @@ -75,8 +66,9 @@ objects for each data member. To be consistent, this pattern defines a convention in which classes define internal shared memory storage requirements. Classes should consider all SharedStorage structures to be opaque other than their own child class. When the lifetimes -of child objects are known to be non-overlapping, unions may be used to alias multiple SharedStorage objects to the same -shared memory region and reduce overall SMEM capacity. +of child objects are known to be non-overlapping, `union`s may be used to alias multiple SharedStorage objects to the same +shared memory region and reduce overall shared memory capacity. Developers should carefully note that C++ `union` rules +require that they only access the most recently written ("active") member of the `union`; this differs from C rules. ### Loop Unrolling @@ -104,123 +96,578 @@ for (int idx = 0; idx < kN; ++idx) { // Loop has constant number of iterati ## Style -### C++ Style +### No automatic code formatting -CUTLASS source code follows the -[Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html) with exceptions and extensions. +Do not use any kind of automatic code formatting, +like `clang-format`, on CUTLASS code. -Design choices should be consistent with the -[CppCoreGuidelines](https://github.com/isocpp/CppCoreGuidelines/blob/master/CppCoreGuidelines.md) recommendations by Stroustrup and Sutter. +### C++ style -### CUDA Built-in Variables +#### CUTLASS is a C++ project -Avoid direct access to CUDA built-in variables `threadIdx`, `blockIdx`, `blockDim`, and `gridDim` within -CUTLASS components except in special circumstances. +CUTLASS is a C++ project. CUDA C++ is a C++ dialect. +Therefore, we write using standard C++ idioms as much as possible. +We aim for portability to as many compilers as possible, +by writing host code in Standard C++ +and device code in CUDA C++ +that resembles Standard C++ as much as possible. +This improves usability +for the general community of C++ developers, +and makes it easier for new staff to join the project. -Using built-in 'global' variables directly within resuable components necessitates that all components -use them consistently which may not be possible if CUTLASS components are used in other contexts. +#### Follow Standard C++ idioms where possible -Instead, components should accept a linear ID identifying threads, warps, and threadblocks from calling -code. The top-level kernel may then decide how to map threads, warps, and blocks to the problem it is -solving. +Regarding "standard C++ idioms," +CUTLASS source code follows the following guidelines, +with deviations only because of compiler limitations +or where performance absolutely requires it. +"Performance requires it" implies measurement. +Deviations should be limited in scope +and we should always strive to eliminate them. -### Use CUTLASS Fundamental Types +* [C++ Core Guidelines](https://github.com/isocpp/CppCoreGuidelines/blob/master/CppCoreGuidelines.md) -Use the [fundamental types](fundamental_types.md) defined in CUTLASS consistently. Doing so contributes -to a framework of interoperable, consistent components. 
+* [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html)

-In particular, be sure to use:

#### Spacing and line length

-* [Numeric types](fundamental_types.md#numeric-types) to represent numeric data in host and device code
-* [Containers](fundamental_types.md#containers) to store data in register-backed arrays
-* [functional.h](fundamental_types.md#functional) to perform numeric operations in generic code
-* [Layouts](layout.md) to store stride and partially specialize template classes
-* [`TensorRef` and `TensorView`](layout.md#tensorref) to pass pointers and layout objects
+* Use spaces, not tabs.

-Avoid defining alternative implementations of the same functionality. Instead, prefer to enhance
-or extend additional components where it makes sense.
+* Use 2 spaces to indent.

-### Classes and Structs
+* Max 100 characters per line.

-Type names use `CapitalLetters` except when implementations are a _perfect_ drop-in replacement for
-Standard Library components.
+Lines longer than 100 characters typically wrap unfavorably
+when viewed in Github's pretty printer.

-Follow the [CppCoreGuidelines](https://github.com/isocpp/CppCoreGuidelines/blob/master/CppCoreGuidelines.md#Rc-struct)
-to decide whether to use `class` or `struct`. Namely,
-* use `class` when the object must maintain an invariant. Data members related to the invariant should be private.
-* use `struct` when the class has no invariant to maintain, and data members may vary arbitrarily.
+#### Function indentation

-### Class Members
+When calling a function or function object with a long name,
+break the line right after the invoking open parenthesis.
+Here is an example.

-Methods and members are written using `snake_case`.
+```c++
+detail::very_long_function_object_name{}(
+  params.long_parameter_name, some_operator.another_long_function_name());
+```

-Private data and function members have suffix `_`.
+When declaring functions, indent function parameters like this.
+
+```c++
+void possibly_an_unusually_long_function_name(
+  std::uint32_t foo,
+  std::uint32_t const* bar,
+  TypeA a,
+  TypeB b,
+  TypeC c)
+{
+  // ... the function's body ...
+}
+```
+
+For function definitions only,
+break the line between the parenthesis
+that closes the function's parameters,
+and the curly bracket
+that opens the function's body.
+
+#### If-else brackets and spacing
+
+* Always use braces with conditionals such as `if`.
+
+* Use a space after control flow keywords
+  such as `if`, `for`, and `while`.
+
+* Use a space between the parenthesis that closes a conditional
+  such as `if`, and the curly bracket that opens a scope.
+
+* Use a new line between the closing brace
+  of an `if` branch, and the `else` keyword.
+
+```c++
+if (condition) {
+  // ... code ...
+}
+else {
+  // ... other code ...
+}
+
+for (int k = 0; k < num_iters; ++k) {
+  // ... still more code ...
+}
+```
+
+#### East const
+
+CUTLASS uses the
+["East const"](http://slashslash.info/2018/02/a-foolish-consistency/)
+convention.
+That is, the `const` or `constexpr` keyword
+goes after the type, not before.
+The general rule is that `const` or `constexpr`
+modifies the type to the left of it.
+Here are some examples.
+
+```c++
+float constexpr compile_time_constant = 42.3f;
+
+float const const_float = /* whatever */;
+float const& reference_to_const_float = const_float;
+float const* pointer_to_const_float = &const_float;
+float const* const const_pointer_to_const_float = &const_float;
+
+float nonconst_float;
+float& reference_to_nonconst_float = nonconst_float;
+float* pointer_to_nonconst_float = &nonconst_float;
+float* const const_pointer_to_nonconst_float = &nonconst_float;
+```
+
+Contrast this with "West const" style, e.g.,
+
+```c++
+const float const_float = /* whatever */;
+const float* pointer_to_const_float = &const_float;
+```

-### Constant names

#### Alignment of reference and pointer types

-CUTLASS makes extensive use of constants and compile-time evaluation. Constant variable names should have
-prefix `k` and use mixed case. True compile-time constsants should be defined as `constexpr` to enable
-dependent `constexpr` functions.
+For reference and pointer types,
+align the `&` or `*` flush against the type
+that it modifies. This is called "left alignment."

-CUTLASS uses ["East const"](http://slashslash.info/2018/02/a-foolish-consistency/) style, placing `constexpr` keyword
-after the type name.
+For example, do this:

```c++
-float constexpr kPi = 3.14159f;
+int const& var;
+int const* var;
```

-### Class Member Order
+and not this.
+
+```c++
+int const &var;
+int const *var;
+```
+
+#### Avoid calling functions "fast" or "optimized"
+
+Putting words like "fast" or "optimized"
+in the name of a function
+assumes that the "fast" path is actually faster.
+That might be true now, but later changes
+(in the code, compilers, or GPU hardware)
+might make it false. In that case,
+your name could be unintentionally misleading.
+Consider instead a name that briefly describes
+the algorithm or feature that is relevant for optimization.
+For example, `compute_on_host` is more meaningful
+than `compute_slowly`, and computing on host
+might be faster in some cases
+(e.g., if the data are already on host
+and the algorithm is not GPU-friendly).
+
+CUTLASS code has not always followed this rule in the past.
+Some functions and classes might have words like "fast" in their name.
+New code should follow this rule, however.
+
+#### Avoid creating unconstrained templated functions with common names
+
+See [C++ Core Guidelines T.47](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines#t47-avoid-highly-visible-unconstrained-templates-with-common-names):
+"Avoid highly visible unconstrained templates
+with common names."
+Argument-dependent lookup (ADL) means that
+if users call a function name without specifying the namespace,
+the compiler can find overloads
+of that function in any namespace.
+This can lead to ambiguous overloads in users' code,
+just because they happened to include one of your header files
+that exposes an unconstrained function template.
+The following illustrates this
+with an unconstrained swap overload in the `cutlass` namespace.
+
+```c++
+#include <cassert>
+#include <memory>
+#include <utility>
+
+// Uncomment the line below to observe unwarranted build errors.
+//#define BAD_CUTLASS_SWAP 1
+
+namespace cutlass {
+struct Bar {
+  float f;
+};
+} // namespace cutlass
+
+#ifdef BAD_CUTLASS_SWAP
+namespace cutlass {
+
+template<class T>
+void swap(T& a, T& b) // don't do this
+{
+  T tmp = a;
+  a = b;
+  b = tmp;
+}
+
+} // namespace cutlass
+#endif // BAD_CUTLASS_SWAP
+
+namespace other {
+
+#ifdef BAD_CUTLASS_SWAP
+using cutlass::swap;
+#endif // BAD_CUTLASS_SWAP
+
+// Imagine for the sake of this example
+// that "foo" is a less common name,
+// and that T is constrained via
+// std::enable_if or a requires clause.
+template<class T>
+void foo(T& a, T& b)
+{
+  // The usual idiom for using std::swap is the "swap two-step":
+  //
+  // 1. import std::swap into the current scope, then
+  // 2. call swap without namespace qualification.
+  //
+  // That won't build if we have another swap
+  // overload available in the scope already.
+
+  using std::swap;
+  swap(a, b); // OBSERVE UNWARRANTED BUILD ERROR HERE
+}
+
+} // namespace other
+
+int main()
+{
+  int x = 42;
+  int y = 43;
+  other::foo(x, y);
+  assert(x == 43);
+  assert(y == 42);
+
+  cutlass::Bar a{42.0};
+  cutlass::Bar b{43.0};
+  other::foo(a, b);
+  assert(a.f == 43.0);
+  assert(b.f == 42.0);
+
+  // GCC 7.5 std::unique_ptr::reset calls swap,
+  // leading to the same issue as above.
+  // GCC 12.2's implementation of std::unique_ptr
+  // does not have this issue. Nevertheless,
+  // breaking the swap two-step will break users' code,
+  // just by them happening to include your headers.
+  auto ptr = std::make_unique<cutlass::Bar>(cutlass::Bar{666.0f});
+  ptr.reset(new cutlass::Bar{777.0f}); // OBSERVE UNWARRANTED BUILD ERROR HERE
+
+  return 0;
+}
+```
+
+#### Function return values and in-out parameters
+
+##### Prefer return values to output parameters
+
+In general, avoid in-out mutable references to return a value.
+If you need to return multiple values,
+you can return them by `struct` or `tuple`,
+rather than by output references.
+This includes the special case of error reporting
+by returning either a value or an error code.
+Please see the next section for details.
+
+```c++
+// Instead of passing in-out mutable references ...
+void not_preferred(float& input_and_output); // not preferred
+
+// keep functions pure and return value types instead
+float preferred(float input); // preferred
+```
+
+##### Return multiple values by struct or tuple
+
+Sometimes a function needs to return multiple values. In that case, consider the following, in decreasing order of preference.
+
+1. Return a `struct`. This lets you name the fields
+   (for more self-documenting code),
+   yet still permits use of structured binding.
+
+2. Return a `tuple`. If you need a tuple type
+   that works on device, use `cute::tuple`.
+   (Please note that `cute::tuple` does not work
+   for all the types that work in `std::tuple`.
+   CuTe's documentation explains.)
+
+Here is an example of the struct approach for named values.
+For a comparable example in the C++ Standard,
+please see [`std::allocate_at_least`](https://en.cppreference.com/w/cpp/memory/allocate_at_least),
+which returns `std::allocation_result`.
+
+```c++
+struct my_computation_result {
+  float value = 0.0f;
+  float relative_error = 0.0f;
+  bool success = false;
+};
+
+my_computation_result my_computation(float tolerance);
+
+void foo(float tolerance)
+{
+  // Approach 1: Use structured binding. The names
+  // you choose on the left-hand side have nothing
+  // to do with the struct, so it's up to you
+  // to get the order right.
+  // On the other hand,
+  // this code works whether my_computation returns
+  // a struct or a tuple.
+  auto [val, rel_err, ok] = my_computation(tolerance);
+
+  // Approach 2: Keep the struct and use its named fields.
+  // This approach prevents errors like mixing up the order of the returned values.
+  // However, it only works for structs, not for tuples.
+
+  auto result = my_computation(tolerance);
+  if (not result.success) {
+    // computation did not succeed
+  }
+  else if (result.relative_error > tolerance) {
+    // successful but relative error too large
+  }
+  else {
+    // successful and relative error is in bounds
+  }
+}
+```
+
+##### Reporting errors from a function that returns one or more values
+
+We may want to return one or more values
+from a function that could fail
+or otherwise report errors.
+That is, the function either
+
+* returns one or more valid values, or
+
+* does not return any values and reports an error,
+
+but NOT BOTH. We contrast this with cases
+when it's meaningful to report both a result
+and whether the result is satisfactory.
+For example, when solving
+a system of nonlinear equations iteratively,
+users may want the approximate computed solution,
+even if the iteration did not succeed
+by converging to the desired tolerance
+in the desired number of steps.
+(Users may want to invest more steps,
+or use the current approximation
+to jump-start a different algorithm.)
+
+We're talking here about the "either valid value(s),
+or error, but not both" case.
+For this case, C++ offers a few options.
+
+1. Return the value(s), or throw an exception on error
+
+2. `std::expected` (requiring C++23) or something like it
+
+3. `std::optional` (for a Boolean error state)
+   or something like it
+
+4. `std::variant` (a C++17 fall-back for `std::expected`)
+   or something like it
+
+5. C-style interface: return an error code,
+   and "return" the values as output parameters
+
+We usually cannot or do not want to
+throw exceptions on device.
+Some code projects forbid exceptions entirely
+(on host or device)
+and tell the compiler to disable them.
+If we exclude a C-style interface (the last option)
+as not idiomatic C++, then for host-only code,
+`std::expected`, `std::optional`, and `std::variant`
+all work.
+For code that needs to build and run on device,
+we can fall back to libcu++ equivalents
+in the `cuda::std::` namespace, when they exist.
+Otherwise, we must resort to returning a struct or tuple
+with the value and the error information,
+and ask users not to use the value on error.
+This is acceptable if the value can be constructed
+cheaply with a reasonable default.
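+As a minimal sketch of that last fall-back (the type and function names here are
+illustrative only, not CUTLASS APIs):
+
+```c++
+#include <cmath>
+#include "cutlass/cutlass.h"
+
+// Hypothetical value-or-error result type for code that must also run
+// on device, where exceptions are unavailable. The value has a cheap
+// default and must not be used when error != Error::kNone.
+enum class Error { kNone, kOutOfRange };
+
+struct sqrt_result {
+  float value = 0.0f;
+  Error error = Error::kNone;
+};
+
+CUTLASS_HOST_DEVICE sqrt_result checked_sqrt(float x) {
+  if (x < 0.0f) {
+    return {0.0f, Error::kOutOfRange}; // callers must not use .value on error
+  }
+  return {sqrtf(x), Error::kNone};
+}
+```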
+
+##### Performance of different value-or-error reporting methods
+
+[P1886R0](https://wg21.link/P1886R0)
+(Ben Craig, "Error speed benchmarking")
+surveys different ways in Standard C++
+to report errors from a function
+that returns one or more values,
+and compares their (host-only) performance
+with different compilers.
+
+##### Use aggregate initialization when returning a struct or tuple
+
+Use aggregate initialization when returning a struct or tuple.
+This avoids duplication of the return type name.
+
+```c++
+struct foo_result {
+  float value = 0.0f;
+  float error = 0.0f;
+  bool success = false;
+};
+
+foo_result foo(std::span<float const> input)
+{
+  // ... code ...
+
+  // Prefer this. We know what type the function returns.
+  return {val, err, ok}; // prefer this
+
+  // Naming foo_result again here is unnecessary.
+  // return foo_result{val, err, ok};
+}
+```
+
+However, note that this won't work if the function returns `auto`.
+The general rule is to avoid code duplication.
+
+```c++
+auto foo(std::span<float const> input)
+{
+  // ... code ...
+
+  if constexpr (some_condition) {
+    return foo_result{val, err, ok};
+  }
+  else {
+    return bar_result{val, err, ok};
+  }
+}
+```
+
+##### Prefer using the actual return type to auto, if you know the type
+
+C++ lets you use `auto` to deduce the type returned from a function.
+
+* If you know the actual type, prefer using the type instead of `auto`.
+
+* Use [Class Template Argument Deduction](https://en.cppreference.com/w/cpp/language/class_template_argument_deduction)
+  (CTAD) if you know that a function returns some type
+  (e.g., `Tensor`), but don't know the type's template arguments,
+  as sketched below.
+
+* Use `auto` in structured bindings (where you have to use it anyway). This also makes your code agnostic of whether the return type is a `struct`, `tuple`, `pair`, or other tuple-like type.
+
+* Be careful using `auto` with types that provide expression templates.
+
+Contrast this with "Almost Always Auto" (AAA) style.
+We deliberately choose not to follow AAA style,
+for the following reasons.
+
+* Using the actual type when we know it can help prevent common loss-of-precision errors in mixed-precision computations, an important use case for CUTLASS.
+
+* CTAD gives us much of the brevity of AAA, with more clarity.
+
+* Using the actual type instead of `auto` can prevent common dangling errors with expression templates.
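+For instance (a sketch using Standard Library types; the same pattern applies to
+CUTLASS and CuTe class templates):
+
+```c++
+#include <utility>
+#include <vector>
+
+// Prefer the actual type when it is known.
+float x = 0.0f;          // rather than: auto x = 0.0f;
+
+// CTAD (C++17): template arguments are deduced from the initializer,
+// so we name the class template without spelling out its arguments.
+std::pair p{1, 2.5f};    // deduced as std::pair<int, float>
+std::vector v{1, 2, 3};  // deduced as std::vector<int>
+```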
+
+#### Classes and structs
+
+Type names use `CamelCase`.
+That is, words start with capital letters.
+The remaining letters in the word are lower case,
+and words are joined with no intervening underscores.
+The only exception is when implementations are
+a drop-in replacement for C++ Standard Library components.
+
+Follow the
+[C++ Core Guidelines](https://github.com/isocpp/CppCoreGuidelines/blob/master/CppCoreGuidelines.md#Rc-struct)
+to decide whether to use `class` or `struct`.
+
+* Use `class` when the object must maintain an invariant.
+  Data members related to the invariant should be `private`.
+
+* Use `struct` when the class has no invariant to maintain,
+  and data members may vary arbitrarily with respect to each other.
+
+Prefer nonmember functions and statelessness where possible.
+Member functions imply invariants.
+More invariants make code maintenance and testing harder.
+
+#### Class members
+
+Methods and members are written using `snake_case`.
+
+Private data and function members have suffix `_`.
+
+#### Class Member Order

Members within classes and structures should be organized as follows:

1. Type and constant definitions
+
2. Data members
+
3. Constructors
+
4. Other methods

-This convention follows the [CUB library](https://nvlabs.github.io/cub/) and is also described by
-[Howard Hinnant](https://howardhinnant.github.io/classdecl.html). Unsurprisingly, it approximates
-the usual ordering of chapters in a typical Systems and Controls textbook. That is,
-(1.) identify relevant constants, (2.) define a state-space representation of the dynamical system
-under study (i.e. the data members), and (3.) devote subsequent chapters to definining dynamical behavior
-of the system (i.e. the methods).
+This convention follows the
+[CUB library](https://nvlabs.github.io/cub/)
+and is also described by
+[Howard Hinnant](https://howardhinnant.github.io/classdecl.html).
+It also approximates the usual ordering of chapters
+in a typical Systems and Controls textbook.
+That is, it
+
+1. identifies relevant constants,
+
+2. defines a state-space representation
+   of the dynamical system under study
+   (the class's data members), and then
+
+3. devotes the remaining "chapters" to defining
+   the system's dynamical behavior
+   (the class's methods).
+
+Here is an example class.

-_Example_:
```c++
class A {
public:
-  // Type definitions
+  // type definitions

protected:
-  // protected Type definitions
+  // protected type definitions

private:
-  // private Type definitions
+  // private type definitions

public:
-  // Data members
+  // data members

protected:
  // protected data members
+  // STRONGLY TO BE AVOIDED;
+  // please see C++ Core Guidelines

private:
  // private data members

public:
-  // Methods
+  // methods

protected:
  // protected methods

private:
  // private methods
-
};
-
```

-### File Names
-
-Files should be named using `snake_case` with extension `.h` for header files, `.cu` for CUDA sources,
-and `.cpp` for C++ host-only source files.
+#### Use scoped enums

-### Use scoped enums
-
-Use scoped enums added in C++11 for enumerated types. Use capital letters for the enumerated type name
+Use scoped enums (a C++11 feature) for enumerated types.
+Use capital letters for the enumerated type name
and prefix `k` for enumerators like other constants.

```c++
@@ -232,63 +679,129 @@ enum class MatrixOperation {
};
```

-### Namespaces
+#### Namespaces

-Namespaces are all lower case. The top-level namespace is `cutlass::`. The second nested namespace refers
-top the general category of operation performed by its members, and the third nested namespace refers to
-the CUDA execution model scope (if applicable).
+Namespaces are all lower case.
+The top-level namespace is `cutlass::`.
+The second nested namespace refers to
+the general category of operation
+performed by its members: e.g., `gemm::`.
+The third nested namespace refers to
+the operations' position in the conceptual hierarchy:
+e.g., `device::`, `kernel::`, or `collective::`.

-The bodies of namespace definitions should not be intented, and comments on the closing brace are welcome.
+The bodies of namespace definitions should not be indented.
+Comments on the closing brace to indicate
+the namespace being closed are welcome.

```c++
namespace cutlass {
namespace gemm {
-namespace warp {
-
-struct MmaTensorCore {
+namespace kernel {
+struct AnotherGemmKernel {
+  // ... contents ...
};
-} // namespace warp
+} // namespace kernel
} // namespace gemm
} // namespace cutlass
```

-### Macros
+#### File Names
+
+New files should be named using `snake_case`
+with extension `.hpp` for header files,
+`.cu` for CUDA sources,
+and `.cpp` for C++ host-only source files.
+
+Header files with extension `.h`
+are CUTLASS 2.x legacy headers.

-Avoid defining macros except where preprocessing is obligatory. In particular,
-avoid using macros for constants.
+#### Macros

-Several existing macros defined in `cutlass/cutlass.h` are useful for working around compiler-dependent
-behavior.
+Only use macros when the preprocessor
+is the only way to accomplish the task.
+Do not use macros for literal constants.
+Instead, if inside the body of a function,
+use `constexpr` values,
+and if at namespace scope, use
+[`inline constexpr` variables](https://en.cppreference.com/w/cpp/language/inline)
+(a C++17 feature).
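+For example (an illustrative sketch; `kWarpSize` is not an existing CUTLASS constant):
+
+```c++
+// Instead of a macro such as:  #define CUTLASS_WARP_SIZE 32
+namespace cutlass {
+inline constexpr int kWarpSize = 32; // C++17 inline variable at namespace scope
+} // namespace cutlass
+```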
+
+"Namespace" your macros by starting them with the module name, e.g., `CUTLASS_`.
+Macros and ONLY MACROS use all capital letters with underscores between words.
+For example:
+
+```c++
+#define CUTLASS_MACROS_USE_ALL_CAPS inline __host__ __device__
+```

-Annotations for device code:
-* `CUTLASS_HOST_DEVICE` for functions running on the host and the device
-* `CUTLASS_DEVICE` for functions running on the device only
+Header files such as
+[cutlass/cutlass.h](../../include/cutlass/cutlass.h)
+and
+[cute/config.hpp](../../include/cute/config.hpp)
+offer macros for expressing compiler-dependent behavior.
+These include

-Loop unrolling:
-* `CUTLASS_PRAGMA_UNROLL` for full unrolling of loops with constant trip counts
-* `CUTLASS_PRAGMA_NO_UNROLL` to prevent unrolling
+* replacements for `__device__` and/or `__host__`
+  annotations:

-### #pragma once
+  * `CUTLASS_HOST_DEVICE` or `CUTE_HOST_DEVICE`
+    for functions that run on the host and the device,
+
+  * `CUTLASS_DEVICE` or `CUTE_DEVICE`
+    for functions that run on the device only, and
+
+  * `CUTE_HOST`
+    for functions that run on the host only; and
+
+* annotations for loop unrolling:
+
+  * `CUTLASS_PRAGMA_UNROLL` or `CUTE_UNROLL`
+    for full unrolling of loops with constant trip counts, and
+
+  * `CUTLASS_PRAGMA_NO_UNROLL` or `CUTE_NO_UNROLL` to prevent unrolling.
+
+#### Guard all headers with `#pragma once`

Use `#pragma once` to guard all headers.

-```c++
-/*!
+### CUDA C++ style
+
+#### CUDA Built-in Variables
+
+Avoid direct access to CUDA built-in variables `threadIdx`, `blockIdx`, `blockDim`, and `gridDim` within
+CUTLASS components except in special circumstances.

-*/
+Using built-in global variables directly within reusable components necessitates that all components
+use them consistently, which may not be possible if CUTLASS components are used in other contexts.

-#pragma once
+Instead, components should accept a linear ID identifying threads, warps, and threadblocks from calling
+code. The top-level kernel may then decide how to map threads, warps, and blocks to the problem it is
+solving.

-...
-```
+#### Use CUTLASS's and CuTe's fundamental types and operations
+
+Use the
+[fundamental types and operations](fundamental_types.md)
+defined in CUTLASS consistently.
+This contributes to a framework of interoperable, consistent components.
+It reduces code duplication, which reduces build and test times.
+It also saves developer effort.
+
+CUTLASS's fundamental types and operations include

-### Source Line Length
+* [Numeric types](fundamental_types.md#numeric-types) to represent numeric data in host and device code, and

-Avoid lines longer than 100 characters. These typically wrap unfavorably when viewed in
-Github's pretty printer.
+* [functional.h](fundamental_types.md#functional) to perform numeric operations in generic code
+(see the sketch at the end of this section).

+CUTLASS 3.0 uses CuTe components to represent data layouts and multidimensional arrays.
+Please refer to the [CuTe Tutorial](./cute/00_quickstart.md) for details.
+CuTe has replaced CUTLASS 2.x components such as
+[Containers](fundamental_types.md#containers),
+[Layouts](layout.md), and
+[`TensorRef` and `TensorView`](layout.md#tensorref).
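+As a small sketch of this guideline (the helper function here is ours, not a CUTLASS API):
+
+```c++
+#include "cutlass/cutlass.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_types.h"
+
+// Generic code that works for built-in float as well as CUTLASS
+// numeric types such as cutlass::half_t, via functional.h operations.
+template <typename T>
+CUTLASS_HOST_DEVICE T accumulate_three(T a, T b, T c) {
+  cutlass::plus<T> add;
+  return add(add(a, b), c);
+}
+
+// float           f = accumulate_three(1.0f, 2.0f, 3.0f);
+// cutlass::half_t h = accumulate_three(cutlass::half_t(1.0f),
+//                                      cutlass::half_t(2.0f),
+//                                      cutlass::half_t(3.0f));
+```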
# Copyright

diff --git a/media/docs/quickstart.md b/media/docs/quickstart.md
index ff13abf9c7..f0d4d8a311 100644
--- a/media/docs/quickstart.md
+++ b/media/docs/quickstart.md
@@ -7,9 +7,9 @@
## Prerequisites

CUTLASS requires:
-- NVIDIA CUDA Toolkit (9.2 or later required, [11.1](https://developer.nvidia.com/cuda-toolkit) recommended)
-- CMake 3.12+
-- host compiler supporting C++11 or greater (g++ 7.3.0 or Microsoft Visual Studio 2015 recommended)
+- NVIDIA CUDA Toolkit (11.4 or later required, [12.0](https://developer.nvidia.com/cuda-toolkit) recommended)
+- CMake 3.18+
+- host compiler supporting C++17 or greater (minimum g++ 7.5.0)
- Python 3.6+

CUTLASS may be optionally compiled and linked with
@@ -24,13 +24,13 @@
$ export CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
$ mkdir build && cd build
-$ cmake .. -DCUTLASS_NVCC_ARCHS=80 # compiles for NVIDIA Ampere GPU architecture
+$ cmake .. -DCUTLASS_NVCC_ARCHS=90a # compiles for NVIDIA Hopper GPU architecture
```
If your goal is strictly to build only the CUTLASS Profiler and to minimize compilation time, we suggest executing the following CMake command in an empty `build/` directory.
```bash
-$ cmake .. -DCUTLASS_NVCC_ARCHS=80 -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON
+$ cmake .. -DCUTLASS_NVCC_ARCHS=90a -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON
```
This reduces overall compilation time by excluding unit tests and enabling the unit build.
@@ -39,13 +39,13 @@ You may reduce build times by compiling only certain operations by setting the ` executed from an empty `build/` directory. This only compiles 2-D convolution kernels.
```bash
-$ cmake .. -DCUTLASS_NVCC_ARCHS=80 -DCUTLASS_LIBRARY_OPERATIONS=conv2d
+$ cmake .. -DCUTLASS_NVCC_ARCHS=90a -DCUTLASS_LIBRARY_OPERATIONS=conv2d
```
-You may also filter kernels by name by supplying a filter string with flag `CUTLASS_LIBRARY_KERNELS`.
+You may also filter kernels by name by supplying a filter string with flag `CUTLASS_LIBRARY_KERNELS`. For example, the below command selects only CUTLASS 3.x kernels.
```bash
-$ cmake .. -DCUTLASS_NVCC_ARCHS=80 -DCUTLASS_LIBRARY_KERNELS=s16816gemm,s16816fprop*128x128
+$ cmake .. -DCUTLASS_NVCC_ARCHS=90a -DCUTLASS_LIBRARY_KERNELS=cutlass3x*
```
See more examples on selectively compiling CUTLASS GEMM and convolution kernels [here](quickstart.md#example-cmake-commands).
@@ -180,6 +180,10 @@ To minimize compilation time, specific GPU architectures can be enabled via the selected by [CUDA Compute Capability.](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities)

+**NVIDIA Hopper Architecture.**
+```bash
+$ cmake .. -DCUTLASS_NVCC_ARCHS=90a # compiles for NVIDIA Hopper GPU architecture
+```
+
**NVIDIA Ampere Architecture.**
```bash
$ cmake .. -DCUTLASS_NVCC_ARCHS=80 # compiles for NVIDIA Ampere GPU architecture
```
@@ -204,32 +208,10 @@
$ cmake .. -DCUTLASS_NVCC_ARCHS="60;61" # compiles for NVIDIA Pascal GPU architecture
$ cmake .. -DCUTLASS_NVCC_ARCHS="50;53" # compiles for NVIDIA Maxwell GPU architecture
```

-## Clang
-
-For experimental purposes, CUTLASS has been verified to compile with the following versions of Clang and CUDA.
-
-* [clang 8.0](https://github.com/llvm/llvm-project/releases/download/llvmorg-8.0.1/clang+llvm-8.0.1-amd64-unknown-freebsd11.tar.xz) using the
-[CUDA 10.0 Toolkit](https://developer.nvidia.com/cuda-10.0-download-archive).
-* [clang release/13.x](https://github.com/llvm/llvm-project/tree/release/13.x) using [CUDA 11.4](https://developer.nvidia.com/cuda-toolkit-archive)
-
-At this time, compiling with clang enables the CUTLASS SIMT GEMM kernels (sgemm, dgemm, hgemm, igemm)
-but does not enable TensorCores.
-
-```bash
-$ mkdir build && cd build
-
-$ cmake -DCUDA_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ ..
-# Add -DCMAKE_CXX_FLAGS=-D__NV_NO_HOST_COMPILER_CHECK=1 -DCMAKE_CUDA_FLAGS=-D__NV_NO_HOST_COMPILER_CHECK=1 if compiler
-# checks fail during CMake configuration.
-
-$ make test_unit -j
-```
-
-
## Using CUTLASS within other applications

Applications should list [`/include`](/include) within their include paths. They must be
-compiled as C++11 or greater.
+compiled as C++17 or greater.

**Example:** print the contents of a variable storing half-precision data.
```c++
@@ -345,6 +327,136 @@ Note, the above could be simplified as follows using helper methods defined in `
});
```

+## Launching a GEMM kernel using CUTLASS 3.0 or newer
+
+**Example:** launch a mixed-precision GEMM targeting Hopper Tensor Cores.
+
+```c++
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/collective/default_epilogue.hpp"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/packed_stride.hpp"
+
+using namespace cute;
+
+int main(int argc, char const **args) {
+
+  // A matrix configuration
+  using ElementA = cutlass::half_t;             // Element type for A matrix operand
+  using LayoutA = cutlass::layout::RowMajor;    // Layout type for A matrix operand
+  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
+
+  // B matrix configuration
+  using ElementB = cutlass::half_t;             // Element type for B matrix operand
+  using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
+  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
+
+  // C/D matrix configuration
+  using ElementC = cutlass::half_t;             // Element type for C and D matrix operands
+  using LayoutC = cutlass::layout::ColumnMajor; // Layout type for C and D matrix operands
+
+  // Core kernel configurations
+  using ElementAccumulator = float;             // Element type for internal accumulation
+  using ArchTag = cutlass::arch::Sm90;          // Tag indicating the minimum SM that supports the intended feature
+  using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
+  using TilesShape = Shape<_128,_128,_64>;      // Threadblock-level tile size
+  using ClusterShape = Shape<_1,_2,_1>;         // Shape of the threadblocks in a cluster
+  using StageCountType = cutlass::gemm::collective::StageCountAuto;     // Stage count maximized based on the tile size
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; // Kernel to launch based on the default setting in the Collective Builder
+
+  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+    ArchTag, OperatorClass,
+    ElementA, LayoutA, AlignmentA,
+    ElementB, LayoutB, AlignmentB,
+    ElementAccumulator,
+    TilesShape, ClusterShape,
+    cutlass::gemm::collective::StageCountAuto,
+    cutlass::gemm::collective::KernelScheduleAuto
+  >::CollectiveOp;
+
+  using CollectiveEpilogue =
    cutlass::epilogue::collective::DefaultEpilogue<
+    cutlass::gemm::TagToStrideC_t<LayoutC>,
+    cutlass::gemm::TagToStrideC_t<LayoutC>,
+    cutlass::epilogue::thread::LinearCombination<ElementC, 1, ElementAccumulator, ElementAccumulator>>;
+
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+    Shape<int,int,int>, // Indicates ProblemShape
+    CollectiveMainloop,
+    CollectiveEpilogue
+  >;
+
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+
+  Gemm gemm_op;
+  cutlass::Status status;
+
+  //
+  // Define the problem size
+  //
+
+  int M = 512;
+  int N = 256;
+  int K = 128;
+
+  float alpha = 1.25f;
+  float beta = -1.25f;
+
+  //
+  // Allocate device memory
+  //
+
+  cutlass::DeviceAllocation<typename Gemm::ElementA> block_A;
+  cutlass::DeviceAllocation<typename Gemm::ElementB> block_B;
+  cutlass::DeviceAllocation<typename Gemm::ElementC> block_C;
+  cutlass::DeviceAllocation<typename Gemm::ElementD> block_D;
+
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+  using StrideD = typename Gemm::GemmKernel::StrideD;
+
+  StrideA stride_A;
+  StrideB stride_B;
+  StrideC stride_C;
+  StrideD stride_D;
+
+  stride_A = make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, Int<1>{}));
+  stride_B = make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, Int<1>{}));
+  stride_C = make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, Int<1>{}));
+  stride_D = make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, Int<1>{}));
+
+  block_A.reset(M * K);
+  block_B.reset(K * N);
+  block_C.reset(M * N);
+  block_D.reset(M * N);
+
+  //
+  // Launch GEMM on the device
+  //
+
+  status = gemm_op({
+    cutlass::gemm::GemmUniversalMode::kGemm,
+    {M, N, K},
+    block_A.get(),
+    stride_A,
+    block_B.get(),
+    stride_B,
+    {block_C.get(), stride_C, block_D.get(), stride_D, {alpha, beta}}
+  });
+
+  if (status != cutlass::Status::kSuccess) {
+    return -1;
+  }
+
+  return 0;
+}
+```

# CUTLASS Library

The [CUTLASS Library](/tools/library) defines an API for managing and executing collections of compiled
diff --git a/media/docs/terminology.md b/media/docs/terminology.md
index f2d8b6838c..e0f04790a3 100644
--- a/media/docs/terminology.md
+++ b/media/docs/terminology.md
@@ -4,10 +4,10 @@
# CUTLASS Terminology

-`AlignedBuffer`: statically sized array type; union-safe, no construction guarantee for elements
+**cute::Layout**: A vocabulary type composed of the hierarchical `cute::Shape` and `cute::Stride`
+tuples that is used throughout CUTLASS 3.0 to represent and manipulate thread and data layouts. More details are included in the [CuTe layout documentation](/media/docs/cute/01_layout.md).

-`Array`: container for holding numeric types - handles bit packing for small numeric types (e.g. int4_t, uint4_t, bin1_t)
- `sizeof(Array)` - gives expected value in units of bytes with minimum storage of `1 B`: (sizeof_bits::value * N) / 8
+**cute::Tensor**: A pointer backed by a `cute::Layout` used to represent a tensor. More details are included in the [CuTe specific tensor type documentation](/media/docs/cute/03_tensor.md).
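+For illustration only (this sketch is ours, not part of the glossary; the shapes and
+strides are arbitrary):
+
+```c++
+#include <cute/tensor.hpp>
+
+// A 4x8 column-major cute::Layout built from Shape and Stride tuples,
+// and a cute::Tensor that binds a pointer to that layout.
+float data[32];
+auto layout = cute::make_layout(cute::make_shape(cute::Int<4>{}, cute::Int<8>{}),
+                                cute::make_stride(cute::Int<1>{}, cute::Int<4>{}));
+auto tensor = cute::make_tensor(&data[0], layout); // tensor(i, j) == data[i + 4 * j]
+```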
**Capacity**: (scalar) physical number of elements in memory required to store a multidimensional object; expressed as the type's LongIndex type
- example: the capacity of a column-major matrix is `lda * N`

@@ -28,8 +28,6 @@
**Numeric Type**: a CUTLASS data type used to represent real-valued quantities; is trivially copyable.

-**Operator**: an object performing a computation on matrix or tensor objects. May be further refined by scope within the execution model hierarchy.
+
-**Pitch Linear**: linear memory allocation obtained from a user-defined 2-D size, which specifies the
contiguous and strided dimensions of a tile.

@@ -61,17 +59,27 @@ contiguous and strided dimensions of a tile.

**Tile**: partitions of a tensor that have constant extents and layout known at compile time

-**Tile Iterator**: abstraction for accessing and traversing a sequence of tiles in a tensor; CUTLASS specifies
- [formal concepts for tile iterators](tile_iterator_concept.md)
-
-**Thread Map**: abstraction for defining how threads are mapped to a given tile.
-
**Trait**: characteristics of a fully-specialized type, typically used in metaprogramming reflection

**View**: an object containing references to a data structure that it does not own; typically, construction of views is lightweight

**Warp**: a collection of hardware threads executing in lock-step; warp-level operations typically rely on cooperation among the threads within the warp

+`AlignedBuffer`: statically sized array type; union-safe, no construction guarantee for elements
+
+`Array`: container for holding numeric types - handles bit packing for small numeric types (e.g. int4_t, uint4_t, bin1_t)
+ `sizeof(Array)` - gives expected value in units of bytes with minimum storage of `1 B`: (sizeof_bits::value * N) / 8
+
+**Operator**: an object performing a computation on matrix or tensor objects. May be further refined by scope within the execution model hierarchy. Deprecated starting with CUTLASS 3.0,
+replaced by [MMA and Copy atoms from CuTe](/media/docs/cute/0t_mma_atom.md).
+
+**Tile Iterator**: abstraction for accessing and traversing a sequence of tiles in a tensor; CUTLASS specifies
+ [formal concepts for tile iterators](tile_iterator_concept.md). Deprecated starting with CUTLASS 3.0.
+ Replaced by `cute::Tensor` in equivalent usage scenarios to represent data tensors.
+
+**Thread Map**: abstraction for defining how threads are mapped to a given tile. Deprecated starting with CUTLASS 3.0.
+ Replaced by `cute::Layout` in equivalent usage scenarios to represent thread tensors.
+
# Copyright

Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
diff --git a/media/docs/tile_iterator_concept.md b/media/docs/tile_iterator_concept.md
index 3c20797fd9..efff36131d 100644
--- a/media/docs/tile_iterator_concept.md
+++ b/media/docs/tile_iterator_concept.md
@@ -4,9 +4,15 @@
# Tile Iterator Concepts

+Note: CUTLASS 3.0 deprecates all tile access iterators in favour of CuTe's single
+vocabulary type `cute::Tensor`, which is parameterized on `cute::Layout`.
+`cute::Tensor`s can therefore be manipulated with the same layout algebra as all CuTe layouts.
+This removes the need for bespoke types that encapsulate iterator properties.
+The following text thus only applies to legacy CUTLASS 2.x API and related types.
+
CUTLASS 2.x implements generic algorithms on tiles of matrix or tensors of constant size. These may
be considered as partitions of tensors of infinite size, with a range of partitions accessible
-by _tile iterators_.
+by _tile iterators_.

Various data structures may make operations such as random access to tiles inexpensive, while data
structures may not offer random access at all. For example, iterating over a linked
list of matrices requires sequential traversal. Algorithms implemented in terms
should require only the minimum set of operators be defined for tile iterators.

This document describes a set of C++ concepts which may be used to define tile iterators used
-by CUTLASS algorithms.
Each concept specifies members and type definitions that a tile iterator +by CUTLASS algorithms. ("Concept" here does not refer to a C++20 concept that uses the `concept` keyword. +Rather, it refers to a set of requirements on a type.) +Each concept specifies members and type definitions that a tile iterator must implement. Frequently, a tile iterator implements several concepts, and its members are the union of the members from each individual concept. These definitions were inspired by [Boost "New style" iterator concepts](https://www.boost.org/doc/libs/1_40_0/libs/iterator/doc/new-iter-concepts.html). @@ -23,7 +31,6 @@ The set of all possible combinations of these concepts is quite large, however m templates can be described by one of several combinations. The section Frequently Used Tile Iterator Concepts describes several common interfaces used throughout CUTLASS. - ## Definitions **_Base Tile Iterator Concept_.** All tile iterators must describe an _Element_ type as well as a _Shape_. diff --git a/media/docs/utilities.md b/media/docs/utilities.md index 66e71cad7b..c464f2007d 100644 --- a/media/docs/utilities.md +++ b/media/docs/utilities.md @@ -2,6 +2,13 @@ [README](/README.md#documentation) > **CUTLASS Utilities** +Note: This document discusses utilities commonly used with code that targets CUTLASS 2.x. +Although CUTLASS 3.0's primary entry point APIs do not transact in these `cutlass::*` tensor types anymore, +users can still find them convenient for managing allocations with trivial affine layouts. +For more advanced host side tensor management, [`cute::Tensor`](/media/docs/cute/03_tensor.md)s +can be used on either host or device for any memory space and full expressive power of +[`cute::Layout`](/media/docs/cute/01_layout.md)s. + # CUTLASS Utilities CUTLASS utilities are additional template classes that facilitate recurring tasks. These are diff --git a/media/images/cute/HMMA.8x8x4.NT.png b/media/images/cute/HMMA.8x8x4.NT.png new file mode 100644 index 0000000000000000000000000000000000000000..adedbac03c6e0b45ba40d18bcef174b13acbe6f2 GIT binary patch literal 547992 zcmeEu1zT0!)+ik!pme8*v~0RXq`MoG?(S4lgbheH2+}3pp~z;_-6d?gyW=k2SI>9u z`R@A*?%B_?SZl6XV~ja!jyV=#N(z#gk4PTD!NFllOTAHngF^-Z*K;%!;7u-}>@*zQ zV`ED(F(qj+F)Ae|doxQLQ#d%Ou=qq&E!A#<_v;C2I671aDzELcT@a+uzlcAHX8b54 zg@8^M5|m{mnW*ltOY;d|A79zU71@O4`-q&nkh(Cgdlj2{I`|>&(yMY+4`3+SpA;PQ2!A{$S!65u_YEtNz zz-aR#Ufup)wY0)DTSWe_Lp>-MF51$kRge^JSL1~_K6>RB@?FVj{oL1KcyQ#1&<_ag zhG=|Gk3QbqFfe9dOv7*WQtb!z4czpYoh|N|V@(`GP z_~#~?)FPaZDj8NBEDhwUB{Guf(APynijmWwGpx{AL0It^e&BF&Pxj{j?80(NNb4Pr zD24T+uBmUhUJ$Ljpjpdt}y}y@;-WFF; z7|A04Y~*wDmie`^MH0VTiTpG9k)Pr7vG^fBts!U7qh-!3UhO!-FEP5J+{$=#7Ler} zjHP9@*{&SZV8<{kk;4eGyCmKk5s#-IQS_cX(t3q%{X7Z5>HXKzrZ1#-D(69n!qI^? 
zxnH|m*MIR9Kf;@->+ca_u|Fm^q(Aa-n`hp+@R0{|aC@q7P#8J{HpaS?FBC_FPu>7Q zoMFV2BnP_w5yHH<(-i>-=Z0Iaf&)7c{OiL&P7F zm_@lpTBdd}6F4q-6ARY?EQXh+5D-+FsbT5;&?W9Veev-#M+0ge+ax0Pa`(A=fs*KQ7(Djhn#9IOZr-rWL8#%X8);PQ8Si3 zhAV?pNQqh@iGa7iZLh*yNV>Wdo2m67fD>M#s$jymle@v#^~ATc6)d88ai>SC7_P}} zb7|cv6>`Q83S)${S$A?~u67~%)SaOLYWev3`noU{YZHEhzn*!K>zp@Ad+I$3ros0X1eV0NXdrz) zDB1T!zudJZBFZ({p;7H1q6FuVeSE;ELTVoOKZsTjt}(j(%F5o|t@T*BBDHX`hq&)` zX&w&Uw!1LTjjKbgfyTpkIcmGmuxx4L2gGVxH#|f!5xO71erZ3%GG2NZ!ZH)$o_(mCJzpy zexBb;2#yK&YL<~u6zL@s5J%&k@=bJ1@3<}jj^w87I!KVKdRO5FYS}^HS-QjzlCkkM zN724P08u0SJ+DsCtwtM}_hlDG;96-t14BwTfy@WNg%#GTn@vPI=__(nMq~!nJR7ny zXwpXw0wPGF$!#?j$P_1(BrNs53}+;~Vini9I-SG;?9JGX`xuCqYoe)i5KuT0gq~Q1w=X9#%<%BKxQfpHI}@&r60rb~)^d-Z?i*3}Z-AIYLqAZn8uPop3uLsG`}fPlJN z!VC1@hfKN1fOd!rG=YT~D&2eo07^(xZS%cOnG12A5|&vP5%s0CB|}^}8j1XQd5BeoiWn{V&phab+0M`MyZ7(Gd{_;Q-}2#oVeImCMmjznPkA#nx>;(n)S2 z)KR$-M;%{z;J}CA!4=a!9Ez~T1kvqmE$q764KiKI4v*dB&V#VR6-SB_2QOvEKW(9K zdDoL0w)%vGR#@~-xbjhj6YiKp8LgrTP3KLU9eE}-fa3*{anh;uQ11eRmUr!#Apw0x zqvf>^F2Un&US`1Loo_6}aog$)X#V=D0!d^`&jV6$Djbm)*4Rv53d_50)qeSezwO9& z@{t&WC_pYXMLy~I>uBQVta6WkCuF~JK=}OB1f`Dk$V)3wE?E)*Y(7Xasx;~3!#9=t z5-Ito=vurz2+&}^DBd+x&s9Hm6!tcw1d}NTdN&<`%}`2URHI%{GZK^M3TdARSntjP zj+y9501#&vc0yRci_28C0dgOBNf= zJBqX+=U?NQ@>Pp9ZZ9M_?dCXCAwHOvECK$CSlE3H?%A=w0O=-hIlWQTY4s@r?ai551V!WuA|+>M?IMci)n>fF z{4MyV$gROtstO+V^UZP;d-$G@>rl}^VwU|&m>|>E^!u_bAg*}S-2r@*LCq_xWwuR} zoY(pBuq>TvoP4;YMFemuI?<1uJ%IhhLW?!NeCmu&4Uh=&uBlp3iYw`s5k8tFF+uhT zhjkw6;$xE+s2u}wLf3Fn-nXz{m-l>O00t-a&)3E zKd{|KH^ozmwCbMvlxmcsW0G;|gE1_3M41yW4I5oDx|m9#hDkmv6*7qqgMCFB_64{) zx-sVuT@PXOrpz4*_4Plqiq!qrmjcZkUQu$qWmaTw^|?eK)m&zSU#vJ(+3VM5nS(3? zIVO13zCaqs1a9tcJDXk7UWa3{0BcHaY*JrIC7lDLN5(o*D%$hyJxZ&p4~Ov6yZ2Na zNb4Vwl2Ng5x*_rxNIQrDeM%7>qqM3s_g@124?V1nc7iFA-;vRB1TiYZHbNfs6u@k< z9Ok8Ka6g*Eb#5*c#+o!qb=v&kqm)xp_ZwR>ah}T&JuwKXW}R3FFA~ku0uC2L%?7oN z$LkgM-Kv4J`BqBDH84pf^8ZlP-H|VEc$V z`{wEpuyIhYmM{kzNL+LlH9*c*NSWk!vP;h+9}K|WdY%LEehS*E$Mf}$k3`L0l=3x; z=DoA|j=|}ngf9FcnjK^daE)|KoYf7)-Zr8yI;n%y>hJamHeYN0M{)!sOydeGK^{AyUPf-dM0))IGzSQn| zX`f_#BApDAh$4{!b|V@*D#`4}l&vmx-#T5WOJg^^l^FnM7#ZW@=r-&&&heZgr%=CX zfxyj3(Cp-Gihw&FX;(fm2Tz5ga`>hkx!4sSK*vlG|1YfR#mAW6YgKP9(D|Yl<&YoY zEoUUIiJJ8#DU7x1v&>YjD^d)sNv5>Dr5CwxZ!zr8$LOB8Fi^t)nLd_lsMcd$Bs7Nk zwWl}&;YEwdGE6{2OPJo=o^N)TGEbm^z$^+)4~a+G+0XtY9uZs+1oqL0Zx=Tu_+OcL zIujF@TU~-W^n3O7jE1X;2|e_RBNpfMd&R8Ao}z;Q!G0BpLK4s??5nBl;6BC;kkECD z*@}52dV+ywU;3r%`k84far@w0I5boy?k0WzP0O>_iINf~z9tI*qt@B$n;45lPd5$9 zP_(hv3s|8>eokgoH#23XulExC$qTYO!yTy&Iv+X85;Yq%==GFq3L%?IVv~O%;*SMz zv1t#b<=RehMvhtUM~Kbq%xQX zinm^8Z>X{!$#-!{%w@wFw>3^_^nq?P86_FnuXvz^het~;e;i{`C7Qu)f3YoFf!t{% zl}E8aRE(!+u|0m4CwZRewV$RtOD2@(ny>PMAB$2v%HtbVCY5lf)?6yp9pE@qT%Uh6 zd$OFeYX6BF1xW-uYI1Dq$dcIhW};!yQ2TeIKIDyqk5-~k@=8;RdzDnWAkS_OHpdt0CQuLO-b77B(B!(D%y#yE46sBu)HQm2NCvV zO2FbOup1it^Va%jL;g}nhs)Bhrc)D`lZmXLYil!M8s!Ff;jQjMe zZ|E42lW}g&@LsVg4OIKFPo-pj`D)8llj zlgz9(A2>K*FmCXBeMuf5CqQNV^mjHz z0o6e%Qu0Wz!{8p54`LMLiv=(8p2!#ac4^y?jWU15Gu~|A`{vKyigC=#Ug~nVl0#A9YCjwEhliSOacchXe?9O2PUj~ijJ*~-P=KP@0S zQUtnk5}__U%e5TcT@y+c=PEiUV;trd;4Zj0PO6y(x9KircT1VxT-`FuwM|&_w#*p^nI7~h^RIe zWd7N)2w^?eQrO?Ci+<4a!ekE<({)jS&*{#HTmIMB%*{%@=o(@DcdVTHj@I_%sz#M8CO&tpkb?;b3gV;^_uegk54QE1=&wANRtPO|{v{-f4A%F2_SZtv{pm}!;i=I&DGr=a{;8`7ruiIY! 
zl-}kWE9^E~`w-a7(3{Tf`m8@`36eOK@HIG```CrmY!vcobf$IR5Ygy}0moaH9s;6oHVCertZ)eWubB|8$q$y{|bm_0b`?sFjR zmFYgSo2yT`hj6NG6J<22a4cuS0cq#(co%=Z`S|b=jrVo*=Kg*T)*2sA)&^^<;J_-n zs~GWg*@$V1>SWm{bdrr<5%!hUcCB#CG)uHjH{gBSeTsx^;t7y-*QK1>W9vz=S9E#$ zRH%{id`3@RHNSeL>61ndg`h@mc=+n-I%KoP!!3@Aqu{U;BOmz2!N>|%Hr@^b|0Gjw z`YrQPWYi#i>s%E*@V!P*m+Gus!!e$*IpqH8ws95ylZSz|{HVVdf6^|H>Z`5c$R$!Ov zv8Qp{5XtuIM&vN63vAf!poy9(!HP8pl-pWEUbO-VQ$CkFJ1>d+*GRih^2lci9qilX zPuenNQ%EI`mnc$cEpHo#=e)g{N!?f6h`{;i6?`Gr@<4dd&rjIOkdc0k-PlX@*2{GB z<}xRaN|Xj)Atas9@z4JPFd58<7g>i4x^QMpTvVO*AD<_d?cd#qhf*#+y~9#3PSKZt z0k72HG%lx?j(9eY!zk?Ik=W>3DzBI+yb`Tq_v>>iBITtl6--+u`sO`3U`8A;qLAU1h)c0fWqBf3iraD-syAR+kN}?Pq<$huo<;$t~J+u zY-L0_?ZmO%yowSM5=w#tYADOi(jr}rsCW2_9DSTTSL+`tAnzl$MrO`&57(~}g_Z!3 z#7fCPQY|g48m0lj5FCHz`srFcmv3Th&YPX2nwe%dR+5{iP!r;5o|$vyd&1TgpPyQT zpSQWUUa-seHL?NF?-o%eB(%#Ndat5_D}G?hzI=ZIdao z1g+r%?M837=(qVZ7*5G8+!z_f^OkL-0D6N7BjwoI$xco_nwxT8YP#DzIgm2S=}2r(QZ2vRGXP66WSU zVY9K}o3P6k_W2Bh&g@+6b$x%ze_@QYU8&2BXFjML#_yWhFZAxxwztMdF?&fhrAEZh zH)f(_R7R&!FMKS#38WJ>irED5yo_Mh<%PP{L|57kxU4^5024X+N#H8To>m0SW*>gj zS+v`FGEu&|#V4Cg)OnQ{UjsaD^j^i@^XY*ti_a(a>A|U5+hqOkR;hRJ-97?w4e3al z>Vmi63L>HF%0$A^R2k(Gtn4MC*KKpjG;cQdr`Zyetkdkp8$+}oGs^c@UwGw?F+QM_ z{q2Is2zde%(m&JPycxdvv>_xcYL$NSZ0lgc{%sHM@;4YKQSmm4Lv@2Cpby$JI|U)MhEF z{M*j=Z~RRNcp{$8iuhihE8E$rP9A-+0~qYd5np1<(yNM}0lX#=EBHTnP3313#5DGX zttGy;-SMu2a?H&av%S~IpTcr7F;9zR(MfbmEQlKVUz|8P5;ABA-nH$$YbW9ioc^Hl z7m1Hgr`8j1xyd&k5Yo$9PMWPJT+`62ox#L^S=M4CQrl8joLd8vl8>F#nciTJ`^wVF zMQpE1Vra}LZu`TGMJB*m>Z*z=<$`prmQ1J++h(GQQ06N`(d{P&wko|nW)bQ2^O17q zK5dZQQLnChFcrZ-GK5@LYqOZU1V#k2pKAUoXGN-5bo7(d+1V7JN4#mlprKSF!GJ}x z)9)?g?$ZN#7w|3tQz8*xqmSaG{7nja3oOuy`>&(9a~4FvGQ(p@U5)XA>mmihS-H`k zYVBCtdxTg*ClJxn{s93(3BTb?{LT(cgyvhWf!5p%x0wIY>u!al_$34o-@xkr}VidvokP#aef8+eieR?X z-=Od*;rsbosRts}x8#n233kg;jy{DuV_kwT_&v^kYt!suq$h;lJAlGFQ`WM#9-nQG zK}(Un5x+VQ4otXnw3}I)2J4f>6XsR+erv~oC0SgEoP?{{(^XS0X z#dMxTK)35WZ$n|lsQ`Sda9n49C}xA(2{{yh=qK18p4cC#t}~Lv(81n*x61bVRLJ%r;EF;@Mv+i{D(`Y*xqG2 z<`krC<9MAqx~|{lusR2|u=dt~KI;`rW8ERLU42_LTW-Q8bVDC^Kg~@px9gUOzAg-{ z`Pp$XfW{f!Wv~*p2_NA|uQ#5zn^%lDX{kS-7G_YTzK!qshA#!<(WyK9a(nhM`gb)i zSs!Nu!cjKhv@;Kn|BvoRn+&M9&Yft{{m*br9n_Nh?a{OYcZ7dyqf-2id zgT1#Sqa{a!UW3BCV2*`Qk<#OCw(%{A>bi5 zlR;{VXYDxXY|)2+M(K-)v5JrdK=_RctsFDJkTw}{hs__UGt2=p-4JhcL9dpBT0JU!e4thDEVe8{CgKtGkmKVx|zW368A##{k z(YgKmz+g&MllhMRah_lLHPg7dW;~Apn@keD>2LCKkgrb85r)Y?3vW5R;#<|N6(XB( z5tzeaH$@S@WMb&wvkM3YJ^RhqI|C69AyUEca1ZZYxilxZ19wp~kK>`?7vZ$onNT5s z1jd!J>8;W4#2@5s0c}_qGY*RWU!_zS0dj4pTPAv9%*V8%6ED9i^r6yBcruOG=t?26 zGtJ7iek&}=3}TfuM@qkJH(7N)A_izSJ(^#Bu5XN`fd22w#l%t*b@b6B-L97Mk@02f zVQmz71xN%lyOkfYQSa(Kd!UZ5NrxS}vDe$QI9vRQv!X2g#hUfV7aO^Ao?)aae4TCP z0JqSzCGhvalW>rmqHn6b`2Vn2t^tG$ah_NK0B6sSD3kX&zk%5c^?_|*r6eW`T_WX( zX;-ZOJeGiw?Ekli{s*PS{_PgK*X{NJeQ$l#4Htb6W9cDp7*{3DvmF;-)XYH=dW14e z-*o#WV^p)fEr*h_4`rmU=_pU@EsvkYIsRbdbuHea!2jomP5>Q(85VSLivE`2-ybU` z(?y~c-Nb4z>7GbP#^PtjfnJN%zPEcC+vZ5NKiL81yg(;*JHM~`$6*H`YUb?2mSxJeJ{d{078h*{H*6=nnK0aD4=|KeDK}{@Ouc-MDgG8=(c9! 
z`=DRM0(`gdnoU2xyvX9GM)MCQfCP7RG$qQ^Ag>JsP6fG>8rvFtXH6!!{B~(&(R=lN z5wK0(-Q7);DEdQ-$phaYKlwgk3n=Iw_gFlhKLlU*0(6|&de9KfR`g4*Bf-kcQ7~7yy{2;_y2H*r zlNUa9FG^fEJn*#C+nciZvs1d>FME8Rj{q>yg_^Tnb2QzRaobq5UM+v?Gamw;+8woZ zlC78;HNBzKVm4%0z}Psmf<%i@7O5;k?VhKeTFZAh3mNXyw>S=aT4tTm|puZ zwGlE2Pmux>-s3!!$xK%!(yM!JIN3z@j#D1&O%#3cW@Vrmd}zoNb#TXL(Kjkrs($@R zArm}^mN~roOzF|~{8ce$WgF|51)I{0Q6`K3bJs!=SRp!zVZ|q3QN{63RmA+7+^IpyQ;QkQhsV%Wz zjX7|HLnn=Z3tlCRagS-H7m!2XB?IdGPCA1Y(>HVDQO@v04*z{V-?Xy#7}h%h)bX*8$>= z)X@Mtd9_g8(SiQ_;I9vHm#8uj(s0=ays0WBaXu1^*(?oDe;{vw!4`9Fsm&{4rZR1K z@p$lAr@{F4kjG{!9PkEozkI`UOco>L^;C@2d##NJ*dl-I^X!WE$A^Q~yj7~&+ew?4 zI<}vMZp{s3V#(NwSBX%Y`#vvSMkSWEPXQ z-I1(&BI@<$T2?5vjBFq0_y#^%q-k1AG zdd-j=fH;sBa2+bi;8CDIZvtUC{g@91#w1Zbr=_N`lmhuH0u#~9m-Ii%E)Swo~Vqk!QBY3cpDp%y8(^J3_Mbi5$N;p z>RCdWMB29>8OFn+kZVEG+zO|cRKp(<3B*3w13LDx7B0%Y$ZM(nX|JM|sbOtms|st{ zVLe}e)&TCnlH`-@u9JzB44u(7Y4O&%($4^8D65Nmd z(!sS@UHM340ogH>zY_A@>^Eq+x$(_=xTROgqd>j6q=I^{fNGw@^T$*ZoD?edKMS=~aI@GhpiKP;+^DncD9P_BcQQOSYM57%ny9!LN_$;r zdq8k^e_}eoESs#WR<4F>HyFIPlM`8X+dY!uH3=}dncQ}o;`-i}ZYazr{iCnV^?YF$7GG@q zRowBUT@L1`1zB=G@!5YL$lw{9o4w}967-0DZm8TbuG@u($dq~x>38^s`$d1*V&Kf( z=wWc8CFU`FHqs~~cnTm>{W4S~0Vw$X_i&6KfM%&P3Cl&;T%CL?_*}}OREN*S5~e7U zR7`HCb3?|(?}u#fzN&57#}}4~TcyheMt8(jDjBK0C2L~LG2uN&3RxG@g6@o-SgtJT z69p0F7Y|E1@T$rAM6!@?cfPi^m|Ru8HEd!6l$J@j1t&cBKoJEJ7(g5nd&e-1Sq2hH zs0i)1uncfBz~6om<#~3y9CBtTZV#m44?1RxbSPHo3m4MFoW$;QSv%j1X$y}xcMCI zTJ&d@yx&tNkvrL(#-3hie4`G_YK7dkC64G_YDAcp%bu?X6Sjms9Agl!SVnTHyb^_d zG!|Np-d9z5gyHc@b^FFEjW&r0``l60+o`8RAS=r4-aQsqnZJnAdo-Y;U*upadb%iW zvu48t!}pteoKrq%fT2BmkQ0y4%UEU+egPT1zvqUq+{wL0P}m%!!4hb7NWBxk)Ld$I zq68!>Q4u&y!%&pG$*yc*o(Tx@TuM`X92Rie<|ItnFnnY}Y+r4EBu#k?@p}K%+lD-PJ_nG2R&NxoCJG5Jk2(BjC76e& zOPJC$QvNjv*Na_+2Ob&E2TWiRm5U;#>-stMYcV+_CbejPeUgSx^7oQ2(jG4htl>5o z0oA78Ku)hvI^lUOyKjE#09Zk7uY$ar4<-J^{m`qnPy?=xq(x7;%bvr15wmdwv~qx` z%460GRnFpPx~*}pbk8CIx}PZr7<%Sr_So`~)8s4i)m7d?{o+`kvJp`f>K7go_NI@B z7dw_4g(5fZILs9M&h&pB>E^|PuF^x7X;F#p|M}6@nB|3T}=oaX;!QB=6JeVy{QCyNG!<~ zy$00!(Rb*YQX-F4cpX&+iVRvSNB@;fhme&b*_YN#^-$~T>mgcj( z|9*^&iaG7qBFO;bvJec=peUAl#mt*QRVv?Ld%3b~kyKTv{Ya+VsjBl^l@#XV6Vs~2 zA>+9|dhqmiIjda8d20}lx7zHJ=IQ*bm8F?Pdn<7LUN673`e)pQFM;q+`YfUID*iHJ zp{sY-I1l~QuuqZ8e0?5Yg*->=$=m- z!i})oav?k?6wt*6NpAuT_ZG?GUNg;isrE!c>(6PJ5t&>1+LTgLYkT_VpJqtB!z~RO z+>lbC&cyXM^DCmod;GGa`+%Zm(*Lzi68!9ADB#NW-mkUe*Bi?*2KCnVCxn!d%op@B zJ}gAP?xH}d7xwe*?Np&z=y)bg=YF)OSIP}bR~U%5y*j)5Oe~0& zo{@5wo${;|sQ5{jkNkXeFyoM*&K)HgHAX{{jr4j|=3I{R| zf8QH&WF~D$Oz6h%71%Uy&iZ`H-LAnk5c$|y=60%tpU{6#cT!&-F}f^vdN|yAZg|HvyWL1uU^Ygoynd) zcQT8o*wfO01LEU7SBYGjBjERc48#)v-M2~l+;eVZ|GIR~SxOE4Bi&_6P>B=w+vD-B zh=`l#TTae8W%4N}s+e9esIBW9h`tfDA|DMT<1itGApf^s;z!g!t>-{FpTPa(h+hh* zbS@(BrC+&58{)Mm!5~djChME?*n}0glWaf$N+#HX{eft#3h&d# zAWkB(BhiNcrDa=y_L_J z(8d2&IgzyBmTDLXM?S;T4o+_*>h)K~;4ns-pZP^RoT5s)^TLQE2ahIxf%9JNYb$|J zx$t$6UT=yJ4Nc7VXB8-^Q$>PsEfUwAL^7_HXfdPL=&?G{Hm4~~*G<8ZfZV2GJUq~& zPtblf0;Fy=<6XC(WV;YO?YlN&3UJAd^yKCv`J9dy`n8^?*Vn~>2Iu_K_$;69Jt^Wv z24K5OG@E==d*jf?cwhqf+=?0KUaoErS6F_2rd|s1zEv4l0BC-hFaw0Ixo{xQ&T6*d zJz`G*uOLI^esA4UDqGg>TBi`8CX=Aos^P!>t8L3B55`kiLbm3*JXj1%=Q`*y5CgL< znr9^Aj^|JU&DPX@z9u9vb9H`BO7HA&pPNbCI8BO~&VIG>eW*3%ZsV$q_6@%LrWsEN zND2lQ4s4n=R5(`l4*n+-d70y#21L|-O&41Hav6KL>?K%lF`w48vi61Bau5hiP!-}U zkzXwo<1_xlXgc5?)fMWpo!y=0;WyzwC~s);Z4{FlP>v^SH_+yV$o9p5;`tX2x6zvd zi&;yt*7%C&8$JY>X9bHWbLk$FrEeL8G{w3cT zxV9(XT~txyYSgn z3A3Xa6V+nrBfV;{$(DCAbIWHp)lq4^;+W>SF5bcLX8Z{qA{?tKeNRg*$1gpB#o1(< z&MYD*BAbf|6iw$14q?i7SSuvU!(uA+F^_;$pe2jgTo- zl5g!%#}v`E2`sv04)UPgjoT1dep_r9DM?$hRwZ%Vb4n`Da<=V_!#5H3bWTA)XYA(* z$AIYqX+DSscbOD(M;~FvQqM)h&vHfCf@T$<2VG;?WS<^AH1_)|x<1hf+~eDX@ZwR3 
zp+dmDNKgG)di0cv2wfnFrGVamF z5o$Y4Mo^;H^>N|H!IH7mY6g$5Z}AIw9n8r)11~g5L@Lpl3`Q=;tWiRVedbsk2tWyr zfH_$9mv*w6vGCLF>?fE0Qk-JZ!LT6!+AnI%z3@(|X5ld7!~-0JHSl(yR*U%wX-?Or z_g6^5cnPfXSHv}|agQoR5qvj}=M~EIeTjF;pAS#hUoSD#(qCWsRJ^*NQ=R=Cby3W1 z?*+Xg?>y-aKfZ_$<;U9>j=Vm#n4tRVT$&=)1SQ z_JGT$luX5(>hRt{#;zRyKQ-XN$lY4T(QIQA1U&;bVaLr}_T+UGDz!{@7vj(eaze%I z&CdH%DIoB1aY%C@?B)xHRjb@S2cm&A%NZz?zuO1m38`9Fb zLDqRPE}aBq*;l^-ME8G>r~3dD)q2nYvV8g&*z54;R^XhwpzDI?@h7ba7KzQ;YBKb$ ziyTXMey6u%R!I5mt{nz#A7_E&RIzdIQy~KC^6f}-U|^!ryH%5)ZE$14u`*R1z8C3# zUn-eL)-al(>>xXWiGhDH?tWH?RbQvu{!qy$$d&R6@me$3_LlTdAo2@FZZ^aRLP z^cZ{%z4{?~_0}`zgrUD94%}K5wbBBBlm*xUfc#97zlKTdcFSh()ytOzHH|6QNb>Eh z)GHc-%(pW>nF92Zzr8Sra#WyJ9qnpi!Pn&%A|3`}lP@1lAOrV;b|w;N;wetZp(6GRz+F|I$VHk+FvquDby_Ol9|bkIbAFzvh^6SFLps9uf@ zrm&RCAFvZGWfj1&G*sV`Tr*q5(iz(VmgvBR2D(w9{ga9prS9-E9MRA%?V!h`J{0yS zKwIN62JxUw0s%=bH&7X-=GnBZ2D~U5MMrg^`-O&sQCAhkJVaw70`)+-GUMjF+zTUs zLuXz6N&j&;qa(7zWDqK}#dKuii6kiC@>6s!p)1N&>kTM7E$9Yz@o`uohc;?kH2H(36i^8o=i@qJCM7m&4~27Spd%16Lp! zQ-{~NdWw&QF1g-x=N z){)q>bxjD3FUC*W_?VkBsZvaMnq$1Yn9N)4-TVZ2<{ytT(kk@`lsdNBzB2oL^k-9D z^LThTR$<8czJCKW2jWk%z8D(lr=Yd*y~R3i6JH+73F*s^{gfO|I{)oP*>@soe zS7M*vy?aT%miT5v)`Z<=Bz>XHx@dsj>+vqwN`Iwgn3g|J9-A5)jMzh_!mK2sHs#!S zj*xGY+BAlW|!H&g+0eg$74E5*M`y zJg6m;352%HS@FD=LaXXc9Y>wYcR9E$-cG<`oG~Zh;jV(Inw+^h@vXF)qb9+wx^M8K z=Je5DBQO^^+GN3h={y_FfEB2cN+=Og7ru|A8awL!d3JI|M^`e|o@rKuw0UWaNw^@E ze|fd5`3O$0Ul!!5qS~nNvd6vhL}4I*qtlxLx*^v?$Wy6|b-CXJn82SFg|SD@OlXB3aawR(Sb{z4P4h#XzR<#f%J>)lvT_#mCEu+AI0(evRf4jhpu z-qih1JSdl$qRWWh_VCuVqsSwZrQ+}UM;1yY>N1s{_d^oPEyIV*l-0`!+uU9=RB~zT zpV^LA{n(`kNHp-Bt}!~E+mQ!T9>u}9jTgzKuxy>)|uGNEzh&kY?UL_<|R zDGmOo(|V5re4iH4e-k2rY!l*7S&JAbN0+vXS8pRh2#+kBu#TI_RQRcIU+!u%8Ud>77Q@8*3O*MHxI zuejvgaO9Q*+8$Eg?_wr2)jBFP+xGYy6aebb;{B;Fgx+zoC@wGaZM;`ETAqH>4SsdJ zT$-+O?nua^SGYapvF9$4IjUMeIX-=olp3nV z<~4#w@IaNL4z-?T&LvbnV6(lesyj#q+rk}Oo+B?lAB#I~G?UD!JH zih@!Ya02r3I;W_-$)*?6cFmL&D>^{gC!K*_Q!(2}cS6|t?d(n}#29P}hK3(53vL4m z+A>>BZcU-o1asv95+02%L;2^;lwopa-sjs~oYvZyfW#)>=P@4dS(0LoAJ(6MK+a@p z*I>Iim_Ne3?6v}6+G#8z=iQG!BAypP?4Pa2J%;!=?N^czkO?Gske4Zm5WAKZ9vLW9 z1(Km(DcGZ^Hnsj&L4N#KkdXDvTh39p57KFHE0Gr0jL7oo_JV}Tu-2Z(IUxHe79p|p zi+pECiA`46YOO`H_nxA=hi_R>FIl|H7Q-WYw8vmcn-dG= zO5Xc6w<<|mod$l>?=Cg;#9Gcat5*zn7nb>dY*zjTQg1bU5LlO>cfV1(2?n8jq-fj2o*fX}{XyPyR(aPsG#~hfZb4Op)~A(4y`xYfGP(2TyZ~c&8R&LSSRkNp)5nlzsZr7RaH%;2uSun&#cP-7mv_)A zuhT!3@ukmvgf_igKE`U~qKVDsUq4t=G!l|W-VA)L)7Yx$l!JJ1x>UDWc=}O4Q9f%# z?1?aCc3(@pSc9qy7S@MiPF@9UZ6E$;r}o$U5N03Ok9#7xE7{RrB+YxHdZqf-9d%U( z*J~>IV!8V~6CfEBOhW%S%}Ejz@u6m|5p-|~p06SJWiJ51*}&CwtFi;GSaFtA-4AxU z9l3ZN9_JB~6wd*YhARqS%b-9vtWl?{AyUmc2Qs!5lcZ{p{<%3uS_rr^gNK77Ol2mci{3|10zMP# zmbKdKWqd70uMVd-as3Z=%MYQDi%dw(NqmWtGo-bIoe{4~fi&|>x5e%1hY ziF!xpOAM+&*RI})g_y4OCxq@oRak~no2B7wxB^g3hHpcE!!WY2F)`BK2O4=FEp$Y{ zfq^3ogg=y7ADBLPmTQ2}4+Se`@q??4J(bK>%?jo3UnapEUYduQ+#3LILKqj&cIO6p zDJhW=a9TuSP?}HL3+jGl-`N^8+lUe2R$l#iuscT;hGP!HaBxA*>&Tj9^Yymp*Vtoo zT{6NvyFQn7u(I4c>w5c%*{CWOt5Z(G3a1J|_75AM~d-ve`u zCUZI%X$aUusfcm>-tNSvoz&D&uTuNL2@Jf^b7DDM$I5PZH8A^wrYi{R0Ax(C@o*bB zwPX0&`G76vE=U%yA^Z31f^}r64`~nqa`GjVI(#%@-7ltGaYnkomC}CuD8`Z{vO_}0 zGV`Y7oWaZBTZ)IfOO86tcb9rMZ*ZB3{_J(QuD{-=&Yk(T9GkOG`K>XCUf}DOpBQ3% zz|urDf&xffZ!~d584*o4#KTJoBCcg*)AX>};yNLo+%B5f!#!d~`h!uzFOxqAp#i_( zN>$wKW?kcv`^w%QAm1=xZ%!|lzi2JwNEr%xOeh%jH#Dya*z;U|CmG>QRoFy8w~~0xr%~7$)AUhn z$~5gDdS^68HR~bf@A{4#BI1>OL5Bwoopl2zaRo_Vwh#?lu}B!d0#Fs9KvVK6##H`K zJ5t4T09GIFX{#%<((8#+3CDtQRLx_eQ2xLogla{t)#^Ec3&D*R%h`u14zfkd~ za&H-5V@>D+E}(b_>Ykh3owr7bTGC~fa|n?Ndq@hC`!Zu>VetX=?{=V8t>k#rn0UFR z)4_~Lsq#iMBam#vmUPK~*~?WPGqf8aHS%?nP*y)XOHIQ(fm$RNh}^BKEUiVO6bXp) 
zJD{aVXbi+3y)mUit>g?S6l6Q*i4(UgVTG0`=Hw=`FIyB|Pk+r3=9unR20E!%w5>J` z;+r?)x63aWwZvTDC0k=9CsbCQ=hAiiH+^N=z6=zT@zORT7;*SI0X2~SZY3|tbKmg> z^qC-eoRD0qa+Oj%JXB8VJ-*_M?KR`dMf7WQ8?_=(Gs3?^)vT=f{gaqqc)jgx6~Be` zC*Ch%b|4=ysQKJZ8@a6C9HIy3n~MR_pn&Z$VKUt}^lT%F!;U&h%b7y(-*urx(GB;+xX*4i?LAEzC=D2 zh)+WZ+mp{Ke|$gkSmX2i0hLQ7EoG?bFCmU^)<;V>=YRgpo|0$~?u;B2+%&UxJ5idx z+?qZ5JFhHGjhqbtW#uRwBR>YxlQ`xZRI}m|2AEJ=8n$@$C+BdnXnqyOjW9_qayy!7 zH9GBRTe&VM`IuDoJv^LSX*TUX9#JNIXZ>f_p@RzkkV|H4mcGM;{cOgD{nD9LCH>`u zZG8UH3hYhk1-yjZE=ubhUJ!PcZ>a>nuj1+L4j1cV$+Q{|75PZsDSUgVUn#y%Wa4C5 zxYKNNB%g`wPfquOVk(ChqjvqiD32h5%91#JF-TY_n^?l+tep)Ao{PbNrMq7phVS#Z z-6>jIP`SK@VF>$7$dA9o-^|a?6zu+eGI7{>-(_oV@ zv5L_UFTRrdl>eUsDWO_>N0FeYtEU@4>s0a!GZ_ifee7+g8UacX@ul81j?>OX;kaGo z8JFpl#UIVq;m^!g9QkHFC2l1?OHLV$;e z1#|6oN7A$L_)*JpF!h-2{&p-_Q*m=hD=~ds8|@I|BLUD+#D?akx$p*)qyG+k6z=AQ z3qS#|wAsls?%jRrot_DFX0V!RhUj`Xj)`+lUId##A9b~XK%$_Fy2OV0=?E{mhI-Sc zC-QhF8aR}1x%ZrnYgR911byJh2De=DkFBx3OciM5LG@INMR)`2RHZOK-1u9z?0?Zg z7>-yoW`-r(8jLImKPVsH&w$DcjzfAJDmEr<#_`%*faV<^G`I*hfUfQ2gwnBfz0Rw% zoQz9fMpQx2VJVL}FW73-3#&fTGll>5y0$N$ryH$zUDn!XFZ{rSrHofS)e3?U@Vk~O z$V;=R`c@)F>B z$s)#+g#zZ2tye31-_5vTZy?Hk8+u?ea_S6Pf6AtlST@=4K|?|5{_!+Cy(YFKF_>B% z3BQ~j-sQpc&`n?+_oSoX&~7?R2hOp2sV3C_HJ<~ASTg!o(PZo%gjnNq!mVyhCEHsf zrJbHYM26Hb{b70|3p`2A#d!diV#@N5=Ld9)Tbux zs8TpNB4d6br1i?hFeR@XlULvcdd3F?C_`x(G0!hAf0d#GOjwc($xS??-71J*PjIKU zf9}Mb@CW>ScTNU)VSnMJ zFy3;ZX4hfz*z0#4rY(oV)i#IJ$yA_%{I^FsDHgfawO^B{U!QH#$*Uv|74>)DLqJ`l zRJW}aipbcEvahusM5|Ea|C(sX>+?tSZDvVl&b^h2CK5{-F^GeBheUj|R=t|6QX@4e zsagIAz>lI&55W|}zcjY0k;`XdwdhJQ( z5-v}_m=(_EA{y7>PzbiQ4GbG{U>B5m_?d=fQFzl?PJ~8uL%B?QB;)hPt|SAyc9*9u z3?W9^uF-kZDQs-)^FZaQ%sjk6smj-SpCA{x$E?}%gB5^J<)b(_PD7d5`~;CvmV^T~G{U%3;tMS5h+T3kK>mKB+#YZI2ox zQiRUdLqX`V-DY2=3|{e%8E@nU~C)^zAvPh@DR7CXm^ z?1pWIRqim%!Sz%I{o)80XP&tCRfx?;kh0wg@sgV)%QHVCzV4D zP@;uOAR}x1*(ha(BZmt0$^?@-&}@c^9UL5R<|yJK+tsN%naIC ze2(qEqa07<(2HO+tg2CryS`057-I!cx$B|P2FjC5BBi7=&YWSr`t;WKr!>-=NLaY- z#x??25ypb<4y1rDSwmj5S5Y%WyFPuw5hLaMp6lL(MBQ`y`zSm=QW%uw%h5uV5!Gov zKO=CyMFl^WN!!^HiL@)SnU&q`l)X+AYRXAZVjV)$?_2y`9!YV4dZ|)IOWWLM9g_A^h_ma}j>#TPRkgAOP8sM-Zdf1qYcSMs!9&k40@&W%!tj3~jIij?@z(3%0HETcpdj$VDr)JQUvk$sHmbj!Z6Dh>0PA{rb3){54MS5TQ-b1OKG#dwVAml z5E7fBc;HKgt`TBu+5;DQ7q?iEfUr8f8Vo4XXLD16HaChgrR-dgDrFnYXu``)ucqpeiEpXx4youDrJ`fba2np}ymzv0cO{ShZ** zXnkdf*Nli!VjC?YNsgy@dnpZGU$JAU6lXN`lj+?@hWbJbd0ff#mqsLC2Cs*1?&^{8 z1WN^7+)g#w09R@o!3~WT`#oN3bFfOn3eA*=jXj_@3%-jylG#d?3PbHMgdq#KHGY=M z9E-j)Kalk?*O-pt<8TcEPU*ej_~%zB#Vx=EN1@bUo=JOQ`9dPiuMUqBC9@^YsxS;L z2Ijn$%D!}_jnBe%cA}|#1vCKHFB-sf-;|0U4*7KGCR6c4QzUfm2gPh*P<1NT1waSNKLzndVv-8(H`&Nwu%2Pe z^Jp)4I@UJYEWM>(wYr}e7pLZE_q=3hdwK=j`l19@EtZo4H)0&r0S9Kuu(y`O1aIlC zB!JVMR21uIKwsU{?}kjA9Jp{B;53yyvx_`h^i%htz8isU-@k=HZreeAH%%uC^)G!q zc~tx;0p)c-zJy4@SyQ4Js5X*gw*UFjEp30%S1q|CpOE@;&mQG^&q6m+bi2*xKGQC~ z?r&n5wMFc2xfa%)metkgk0BoQT@AJsi8tq0X^|~18S|e^yBalHzYV!xA4lyjSCE5{ z#6nQC)UFN+`v<=|N{*xs4i(9otMg#fsT7KaD>4}Fj_q9#zN<2K&oSt@DiL-&fSs;q zmB(0^J@nwJ|1~Lb^)e2l!cWu2Dq4M~E=DbafAo#)e<4UOcgWU&9Uku14IMA& zC=+;Cx@grp?{uaS-iLJtP*73D)~rh!E-!>n5&5v0T1*$cU8YN}<~!EeA1q5D$ZOSc zGZ^^0flQcWg>m`bh&2@0kXPQ0JbZ^rMgJ^tW1fNl*L#o};Tls0jExy?nDNqvW4%@B z@c2cy;@P74m?-&UImo0xVY1o~f1~n=hql2@9Q&U?O3H5QfV zft!D}Q@m_(-g9_>-@o5)tYg*npcz(p*4y1+iAB&lJ#UrNdvln_*bsy5)P^Sz-^a;{ z_Ni*{@xzR4-F!`rVg zyU)R{sI1{uEjDQ`MEw5ri#Hi~62jfJeaI1&to!~Ph*p}7wAB=8Fbaqr9g`2SFMV#E zOl5OI<`Q7C`#Togf=ynw)XD~b3f&yFiYNW#f~6l#&7MYwobLOWZoD}2@d4*(5xtm@ zjq=FsCr-Yfm{`EBABhiLq7g!_{_@1s>h`BPn|ah_FK{6v-xtx;N`n7n0n9QAl11q+ zQ=T^PmiKMip|!Y{eR*sk`9^7lPRLET%4ws3pQLR4vCnK5 zwQLyo7Zf->cG%o1x4bms3&Z1xINB_wIPZKd;(VZu2(+gHUyg(^`CToGx!)3rU=P1n 
zrByrM^@*oA(3z8kg?JdyMR;I#8doQ)W&4=Su*-wSoUV}@gRU873B?E zW27xOB+~vD@l5t6Gy4bdeYU5XdvLhOSi4XNx-O9uY?l0J?yB}!x(R@-cL}7jhi7cH zb)#pQ$G#g-9G23Pc6g&=5bQNgyZcFY8tP`-tIAQ^?L2P7q5HFA?Woyvi^!A?yX5js z1@%eW{ZL6{u4ZU7=!KcL;WND)wImXjr;_zk9kwswraCW%};9uMt=&Ny-*j$h6g7z-sP7jN32{A%>i4D6X4uM?H}4Mr-P+} z5Mw#aFy;-AY}7`V%eD&*9&6a zCwQmI1s;0mz9BI`1~L`n?Uh5=}k<=A(F!I?F;ZO zYdqzMX2;ptqrm3MZrGD0s1j%`e07HF!k0SeXwfb0Yn~hX(|&B<#SAtFb+GH(ttz9@ z&AZOmY?em%`Y`mQsdDnDA20VLFL?*LM?Y-ABT@Y@(yH^!eY7yrW}5p-{!?W^pE;3u z)B85xgm^i#al6XBE_`Oi^}NzxFhtw5p^|!2x_NME{Uw~O(IsCPC9>LZ{~BO z4UYn+Il{)fbl#!7&!rxn(oZZ_w46FV+sdFLH%eBr5XU|+2fjtDGz4g4ILOC4mYMMG z!WfD`t$f64aQ`IO_BvbWI+Ey{}brwl;D0!R%;!lTZwR*Q+`Wuk^)pRaNo`x)(*$vTHE| z#w*RHwa~Kh$ibv4X@k}^(t(MGgQ_o_ z=^kG=t(HWYFo~tDTF|`o1?UQYT&}WD|AHYXo$2!s{N-ha2T-&Su}oldnNh zH5JnAC!?@lbUMP?$5uVCwu0}Igv-9vj8b-#Zk@NQIB@#w1uJF(VHO9y-i23$VO8z(Y+L;j_GT}bFwcEs;nz0OR* z!F}Fi^?uX^o^-P9607+w4v5L-af+jrsRDv%)0P_L)T;bQgd<#7E`{+6YtiEZ?L6PX zYmBs-Z?!7=VE1oOL(cls&=;&|TRwk=L}Z-bSWy5*H3`NF$f`-XU4n-?p6 zHodG9mtmL=IlzP@X2|ICU+Gm!d|^~=cHCdZ?3qM(bK~FyXqs+G{^2rj5mN)Dg^IIF z>l_KXCF3`<0rMagJlPYDv;Q1U#iUc6$IRDc#BIbrP>@h7l)4S^;aX8ZO0C!C1B{)D zWDcWu+kxZiTAx1Kt2~}&b2%qPJr(g`v|r8bdA|xj0j2^E#bq*2@36H{<=H10Y7Sd| z2(s1b3GkQt0x(22}fE|O`+78Kx}a&*L%@WEZMf^UpM5(%5d{JIv;mF4D)vxnG zQ&;r>{>F%Zc}l|t8CIeBOFhATAoW94dhOEL@hoWW?w78q|9Eyx!GQnyU!Ezak^tr^ ztM=c4*XJ~93!UsVYDTRTnV{(2v1EX23M=M-;H3i8mJJ%N6H8VawE+0tF9;i4Sg|P3 zkH5JSBmQo886Q7HDEQ$t0grQMSG-iGPHm}z)(mO0t%2lx;QO+xA{4IgzBn6P`5Gu3 zcw;@RPV@L*zO3V$2f$a0?f{R~fe~pl_|v zI}Q$j@%>=Nis&6X{d!k%4p|Zyh#z`aG5`g(`U*DwE%_f+I3kk*N(SEiSK#?SwckL> zVwUr+WT1)vxAFhqANiChgl4&HzAEzmqi61K2pf04y*9gvF%*r)n>1E0nI-eP7?M

&%uCs-52%om4F7TRQ;;km~jZ?ocPMai6o!AeI2(a9clBt+KlZ(YRwQ9(9(_Q|o?i%{bTloC^K=GH`Ff%O{>u*}p_M zq=xuQ=X0*>CBbiO-yMuX0Y45h?`xs|iqz(QiYcj7fq1jXjLFRRM7fNqhhnt8z*Q1z zXlQbR6OWFBVGWOgd<~5kl9Hizw$lLNjP13K;lB?G6*L8DizDLs_dgdg(ZU~5ls~F( zcraOMBzx$}<5Qo^>e*WP z%*JHlH~iF8?)A=Zn*oo2b~ZKM^T)~hVe?)?#M_l%x@Num;sg_yiC>f_A0crwlv#Mz zYCtsGh>*qHmz6Z)XP zaiABk?i%~uHWViKmg<$gkc}4KMo{O%t>93`{6jG;;KD}o!xgp{&{|rBP38{mw;7rc zRA|jDc82Dv02&olA4VoXC;4q?kn==OV^o)N8ICnmdAhz#Jz*E7wiBb<|r74nK1|p zbF{p-Kh({fUFee}xU3XDu*Y{QeE@-7lH~OsbZF~kP2{C=73>=bgsL#uVLgJ_dFe)5 z3~Ar&>EshO4Tm`GoK4q!EyGzwyq9M5RVQN*ONI6 zOR5ogQf6>y{^mOxE=8+eo4=Kpcn-fC_T7qF)mmO&^UvW5zi}NyS?rA(f+GhRR1zw2 zKb0BaFFd;>Zabn?La(T`+6j+UAuU zdP3T4{grbf1(kmp$5}qdBhqq9m|AVwp8BR&51zjKEo> zAuzkqao+!hWka{96#tb*YiZC;>$SWhaG-rejR#cbzHJ|RRG|8))%Kbm13~>9TKho3 zsh+~9&HG{Ik;eneZ{}<%M6K(-p;ckfMAp^-oa=$(-~~5v;L2(OpP~KN_wE|`fjn_# zX3POn!!_u*{+rS_9`%s*3i9G`ZHVQaA$ga&TK%%d0m~D6UQ)dzhu4iyHotDfN%vmD zI42%eeeAw@aYOpc`Vk+;477{q<={qW{z#6=!{#q{BP18~S-ZuZ*PCyr_yMqIiE07s z*^&wGZ|LLEeH!DJTzWCqq440eUi8c``e2TX%B-fKtZwg@Fr5ZB^W)ToWVCoQ9jnk( zkdbfzk*L^au9U@>q{N>ds^TVPM8;8_PzU|y*gfecaw&_zvXt{vR+F+95aL&-4zoQu zhYvNh2n7#Ue|*QDq3e7j34dcY-yEa4s<*Uv()6bKeD9BR)&sAV{VbV`L_oCQan^40 zSHE$-&x+Z^NAAOtSwS7zv|Sjm&3zmW(`LI>K6);{wTm44nnq`&Bug(LBbMkyxqf)- z;6+IhV1MCgNm=ZG-k|+^&VG0Vj8h#c@c4f*Jiz>HrzEgq;rD>$Li>L$m;Y6MJ9`jwSbD9)$u(=MR6!EdV9p>AP1PleV+g1|ulbIQxq!*aYD(NIU#?gj*ro#X zgfKPNp*ZO&c&wpB>`S$o?^@%ScD6GJIXvzGZ%leasccQ8tKtC-UfhMcLu^u@R22@fcR%OwQnt)tj*ag)J=JRZU z>D_`c<@lh{YDAAA2W+z!DtRsvy9KQ`M)W916_kG@KN<%7po~mb=6^XkuO6k~M%Vy! zx{s3_hCSnL6-CkJVlO=0@}v0@qUYilkez8CMR)?cU{a7Lh z(GNCQOj9aY6|h_T#&qD--V0J|2sl5Z&l2?}=71t~&A5>1$@a@_YPZ%s^{ikEu!H%# zRRJo{HZY&~oy;cjL;?;eUc7kW=;9KsGNi4GfsWoj-u0=c z6c+xAvF_)3>0ayzU_YO3DfnNT2;yo6KLSj03YutdcjtWrM2^`|4jn%m%Z@+2Pb?SNHnV z80sQlfp7MOuIw*!-{%_Q`=9`AS=Y?_Ihgp2^+)y6 zK)i^Zv)N!aQRhz{<{BjKR4jCG(fCnlA==mteP)qXX<4h?$SQ+R1|&dY?RJlzxiaN zONABV*3DAFVy|j|igk*m5#%(Tl%tckj{%R9I0QX~7M2Cyss$YD&td}!T27iq8!x!d zmv6g|NZzYA<5A7~MSQYSTSmU4rj`5-l}m}(QvaBwyeAG{GfzkT8Puca90)L=YB&lA ziC2*nJztEa-qUXi9$3HWjxHQtIP-;Bf8*fjc>0<(rTJb9 z{8(c8rFl7*LPW?^qdSYQ(0PtR?QlGstY)xC%@UWFQIKTNND+kA3_(%jt&aoin>jh` ze#^r;f{WuY(^t<2X2*r0=C)5GQ1tW};Y!p|69&377FQQ+V6hMA3B)<;8(C+~TkCuv z2fezd^O?=61#duR1F3^zaLNd$q!jRkq!}0FMsNPBr=&>z^{Yf=2R$||e={c z@|oR>>!*2#Jv6PeTB`@D?IL&pa)^8}8kVCt$;St3Gl%;;Grj3L-vC!*wiexlYQV3^ z9SeKPoNB(hUc6H;H`1`nM%9_zATB%9g1WGNHhS35$-S3#!K`^sA!!3EHf=rBazno@ zS9O!_^Kg?kl{7GEuV(y6BG2}9=dHI{>Pnz{+&(}m z`Vt)v)AL97D<3%SFEnWjxW34m_la@}htpa`ecqBf86Va1^t}Yj4;9w?ZSoE6iAHy} zQOZ0J%B8Fepm6rfn6Y;MKv`tU^VSzgD-vlqGTSj|OeU@Cuyh;H4 zddssYm&HSV%NYt^{VNDId^@S4Gegzor<0tlE9fUZ)agh1z1dc8@*BFe8{u2(?sIZI zExUHL3t@hfbF?w*cLNMaP=ckg=|bcm?t&-7Ix`l08-55kTllQcEDrYGy)#jDs%d8disva{{fC2cnC! 
zMWFi-KZnU4)2AEK9G%y!7k#XK+^@+|x)oI>SVQA<*o8-0SN03P5GPx6{b9}bp$r~S{;AX{Vmrd6GA~Ze#or_Bde5oX5fvE3$_aDm?6bQ^aRz{ic z6vgZ%0;Nz6Rt*Mvqc6X@NYK_dBt9=#UXNYsW?JW74vTgOeO*t`@~8WJ9=k6Z3o@`U zMo>dwTn2AW&Qtvgt1LQn>ls~TW3Q%B8PyNI5B?4#r2edO$Y`0Xzuy9+5#;N}yd$}3 zVZt6C?EZ*aaD6>j=W5hlzlOs*f=teu0PPFM8VTJ7fiK}Sl_$ zPHH~S%gM3xbs~LwrGl8J!|Cf^EAn=R@kNQ&XE@2Z^+R`GuNS8|*yF>o329FWz3NUN z`PpRVP+K@ItS(!b+QLT4m~*al$qYRP8h%7OAzCnbV)kHN-N_hnr-CHD+7Th1Yx1zR<9EV2Aqs1L-=Nz zn>AtFS+`{@F!jwUzY}+;jmVIdlm#ggL+mV1UeI^C<4{7rZ<*F|q|G*#K-@;!DT#BFeur#X3y;dv`KR&xBnf|OhPy=)E35^=KU^@?X zBAa8IKdic{*Qk;gzS;G0x1;K4Ll6EuTBrIXV{X7&&>wXi`m*Pt=CN$pEQk%HmavJF z;CpY_BI}Kb;;5NRFgz9@av>g z>9x4(5Uj|r&R4NjS}q*)d`GQ}GznjR^#wD6-44{x>`~GkpRqds%xN1oUm|#L7uO1#FPR913gzxA7RZBJeRz9Qli|xC0JV8 zZ5yfJ=4<7+I<(*_vlm32YdZLtc&7Wlr`TSA^Dtbr-fx#rW(X9?LtRkCgblZYZs`EV zpuQ5Nm~@`g<25@5cpods=Vwfu%qrQ^aBu(_-Arp}uNJ31aLsHwge|odE#RYiR>LeG za+|-RlvU4g2;d`e?fv@7;=DE-`KI=z%#bGmxYg7+gU?SldI9D{p6|Wpx45IPOs48{ zSz@=SeTThKS+Cg{D$5WJoQd{aZEqN*OgZGe?1}tHUp+-)FF)m_9haA>-wbreBc_N#s#W+FgPzEiru$tV?74H|HShOF--GC+$t=4DT*i~YR; zdC(8S9(y^?N&VJUy;1j5>Y%bV&b(n}5$=hlJ3uHI{62W1;{^#!{q z!CtI={p73TTPd$}Va>g+E(?MT8M833J^$0En7m!G(rg@5yF{rna6>ndk@t?YglA>O zQ=u_1Zle&V09YqEPN78uW~Kz-iBH~CY6#kM=G;Fd9I$Z;w66r_vi>B&Q%oMI_F?R4 zvz@syy*^V*+*A{2F~#J*9`*T~l{#D&V1+#ck~j!L8G^J#mB;qAs6*S%rWhdE7GfAW zS5%P*gKdwc9}|EKnah)_O%t(c7OvFOr3DKA0)F$tZD%g&?)8u2(cO1BR-*OZ!2;&vvjSK?EvcDB#>ogq7Vn(5U${G zUjI5@{+L_L`RvgeG(EVYv#G=$-SpSz+1~7cChrJDOZp$v{V+dLai0uD_@BG1{i{W% zv)J@-kbbN|c`}k*%OOW*kIcG`h9LFo^!Vp~_`!CW5!h*3D#~6k=lQLS!~==1sJfV` zT6Yo*B~Zi@5O-HStBZ)J)LQG{<=4wPQBrYBeQg&S;Ft*$RPG@Lq(5&FpZA)Eq zgj-MfcAataCLVCF9G4-lO`<7ce~^(DIk?Ra=}DukvOu7u&C=9juYLXp&W<}tbJTau zsJ&K8cyciznIxjahK8G+^E(07hsl8_>c1mGbQdT2Vx9affdKw+AZQRI&0rbMdZZuK z=(j~yU2Ppse7t={Q;@`*j!Sfcgx-w*9~dzou86Gm+Q!&Q98YY<#ZRht2+Fse2@ zF_|8$N!Ngn;H8ob89cwwLo)AT$NgQ_>?Qmb53;ph)rX@PFJxz33Z)kJh34*xsusIT z^H@(ZO+Zg@QxTq>zn<2gFDSa6ZrLYPiP9uM41@x(xMhbzPAMGd-da!lp$$VSdskkU zy4y{Mk|#UlNS_jGUV16wrrl$V%j$Bi+}z-5#(wjO4Uc8WJ3?W8J!XKfaCNE; zAT|8AI1+$^D(?%%DBJi)nH95gJkO)$I`6vtk@m3n6F;r9Z-$yVW-67sC2?w@N4894 z8vR%f=9=PU%H2XS9|^3zguPJ4Sh*5NMu)8JmH}4tU1vq18n5NDSspqQ)ecTgOEKV7 zJ9$r9(T{lHk{vlyP{N7<(F&-`XW0s6=NZ<2jHAxifEj-}r8Lqn{fX7yCJsQaXwNDy zRT=cC)T(OGDcMdRf7&(!hOuYae}`(;??YKbb%$u6PY9$VkKw@WdD4*f8^!+poYP)I zxsNoF(jdBBcuOqqMnnzl{~Ee4@+wJR zC;s{CphwVDk~prmon-TR>A^p#V4O5d|IQy&6$M8J-nB!$E6DEw!lxt|Oz-ZN^n5Gk zDfQydSjng=SyP;7=kDzqEi>n4ACJ92OtXFoKZ$oEF`TH=VQP_{6n}`-=L`M!lWmms z3zJg{-jRrh!q19E#@;4#*ws=@r32istDpNXp4p4PPXOP|2_ADqum~V20~awA^Nk$2 zhCgRK;h6D%b*at0hNJ~ZU|+^As%f~&U0}-pkv9JxKjJN{m@j5%K~h3utSDiP@#@OY zRf=Jl8lNIh$-05S@2C$XU0`aA2?4f@9yMrW<#~57tga?f!G}~<7TEa>v(B`9-XMLVfkUs`uwfKDucf!skxKLSQQtGlNR zVSV5o6Wcgoyk+3N1RnyF3liQ&VE?h7jt)&A2?ALTCTS4BeHCf}`H(q8%4v@#Wgm1G zFO_0|3jQZbA+~2ViqXbgkmot!QQ=VG>neNGpPvKeKJrA4pR3;!zBPHK%g?s1F{2i{ z+Xob(3XYBKUGao*J$usKVTG*$U;EMb5XNJ*6Dm@vCdHuk(;XjH%%&X4@Rt!}h1J#2 zOZUqc-`-!K{x&S;g-Gg&+=6=y+BNdt7q#Qvze0zCH*V4>+w&-pY z$dJ=QhRry4jWf67;-Z?ee%@+-&YSklCJzMo1O=i{;7mr)mE0{Ps~B(qob^WNkVM9f zJQS~$R;6Cfq;zoEj?H1)W!_!%U!QK?g@I!$dcSoC4RGP0@m9~sogq-|J<{S6A>c}7 zsRFhw@U1@J>Q484{0~Psk*HKl@^jiCj|g_026=fW}{pz?z&5V=yweH1HfaB zy9+j5^^|F@_IVcsJo^C?pLAUBGz)N%_pSM^8?nU1&x~h(ITfko2U!H1N}Gjo1IRUZ zbHD+7svDwOWg$ICdPhbh6R zjo7$x-SrFzh@-f=dCh@N`}sKo^LF!(B7X)Dn$cr-c-=q7%sW6x2m=k(%jQ!F_?_|a z839)WSiQ4?r)<9g2+lXAU3=Vs<$+{;i+%rZb@%@f184eAG4OxI>uH%dAL(G5btBp= ztzN#$Ya*fH%72TvxB~K{b%sO&E-1=bzJ;YXkz3fN1s;P&-^vtBo2Z@41x%{2D6^dm zu4p8vwwSm@!|Dc!)f{4=GwQba7=tEo5V!f*U9*nnazX>Lx=+VjWh0&GIn7T9(;vRp z5mr+5W4M!kn%#7Hul0p8DnOM&O-s2}@R5$Xh@Bq?%_wndN&rB|UE48}|JwwKA0YsL 
zk$_aFnGja|P`xIehHLP{#zE2@WAT2OyxhOZ2B<~&!*ye8L31>AB5 z@pgAHZ20mc6u6(0H50*#?@M95_;-Nt9>AYuL;r6iN_ThICe_xnY$`GBn^9+;t)Ha9 z1y7~&LL#Ni(ujmL7P>iU6KYaqk0}DpN0V|*E@enX>~Qhve^JG0+Hu3X^*||QrYW0 zje>D9oTsn1xvF~|a-xVlLz#Q0~{ zSv8PRZ114E7LDA;*$2e^@9+M!F;lLuZ;0sSVR0{MjeadtVCuW~Kxkqo0<(^KvB}?V z#K)Cm5G33L7#ua0NGjLzNg{Pmu=Neon2mw5THhKV0|f#y9OhKUcB6j&RDm8)24XDm z{`>^K-2nzeL+pQ!sDPo2g#=yKbf@Ld+oPMUgR!ezhQ`}p5APpG?0rI`nyRv9WZ;d}AfW%ZJg{>1QFATMaB5I%SMO!NIEkq}gbVwiD(3#yHwfMG2D0*WN*% zTD(FNui`+Wkgk-6bYDiVO=7~A7Po?XwHT;<-J!2On#uFD6>%=$Abs3S_Kj({@8d6a zppG`w8smN=tanRuC(GumOhuYE9VxWk?g40GcSI&FG6P_o1AY@~1^t+PS`S?DyW*wy z;3g$fjJ|=whyMtElXwZMQ#U-@p$PokmjKrh1S0Q!5+RqpS8GQOW4&#~qhH{8*U|C( zM)RbJ8cF6_w@qYxxPxQ7YO3Y_wM^TyQgR{X+uuzOJc3bM=PXas1)r1)e}l7lzZY~e zz?%f6BBp|=o(}MKWn108FGF^DhdIkPuuds}Hb?ej}vs{}kcW@;NBwnt&Rhh}X-1VGuxFVtrOf zcn4UV)TlV^3#4dMY1sDX(DyqxwH7as2GkjbCL{vjlpk{*ov7U&j7;n z()ckoUTWI%C*!|sDI@svwH^t8#}s>r6J7Y>4?@tmCZ-7DrIsoO5``bHUg4(#T84LJEF^d-~_4=S(> z%0qu1bIanTzPSIQE9I9YAw(-f*q>Ok^oho$Zk4q3amqxeJ-k`vJ*U$%9!I5Pz85mS z{10UPg@c;uk4!fhlt0U5NnkxE@gS*h+#Ub~PAsJ0>;eIYA{bg!Wu#i~HSdcCY=Kzd zv@@|?OK#z?LMbXwdsLJQ91v(21R0$jJ;2bQ_)MDYpi3nhuowBWOC8~5{Ir4vU&R1V z9>vYklm9!WBeym@zDdg7(~c_a6Z5;f*~0?sJ9))9-hc>qe`a4f_p@2kWqek^tA8b> z#^u@LHI9fyG#(rcP=)$&ARSKDy?Nat2q~9S`0Jv zF>w?g1Tc~G8@$m^qQ|t}-vOj#C)QKlo~u`hq$B1braPiNe`Pr7M=D^$9cD)k zx03|?Hh4aMkqgtAv0Oq*Q1_&uYPTm<>KqNaJ&!v^ipFy^fvh(R;#2}m#gKggquQM) z2+KiVVfOo%)5~wnbRn}P_AFOki^^^Z;UE%beL5LvLq)yQ5ylo_S-wDwJ|}){Td^Gp zq&1S)saFa?$*slKwA0H)$f2RH2B!cR`W>bJQ>!2l5d8BUaalU>YW6*RXTW)BaMC&L zXX)gMR6P;X2vaZmW8h}SmV&+)IlHvBCj+uNkEip(cU!XcUO8L2IT@HdyNq*jbaU(O zB#(3S-3DjOyXi*0C3f#FCDL;GSnwuF8VC zlp{;*N?+);m&wI51>(njCp{8#YL@k&TIx5tErvMWR#SgR{Dy|tQI#Oy)wr;&uqQ=Z zN{7X9(YVYbwrUM;?*xVzNaFN=ME7t!Sv^am9@-TSrzMXq;z~N~^nhZgaidMJIGm0aH!BhaP4~3XUy5KC!t6AlM3W@U+~8( zmo4CyOoOMl$v2N4Kb9C|s8y-HDwInHvq?xu#J{k?SYgTF_=OnD_lctcBErg7S&PRb zquTrqnI$d*nwJ#gWa`pDc%pr7j*>e0hEzU5hYDv%JA>DanBgQf1h=vD)yc>NCs$JHNgy1iBzwMX+FkTt9Z9)jv95_$E5BMymvB$lfy^>BOuC*Wr4; z-}8rC%w^}v1j2v-UauQ$Eq85Vtr3fz{1OUe&FY96vQCG&Q$&rsXqJ+{9EfAS)BnO5_dBfCZ2E!0#$w%^G@Qcx+IUi8o}WB4TrL{O}1A8o$Xue2YP zitO@?4RlI<2~-B`?jpPN>pRG$^kZ%&ppjFi5e1G$n9@fBCX$5vB9N)^%-7Lu^e>cp zC60HeS~!i3jWOJl8L(?}K&nyji-zZ&A=70wcm(m+(xQ-=4*jzA2%3B(Uz#0m{udgch! z5T#i3iP}V|DM&Nc<((jd>hN@w$*RCJQkUy5bpTAzdT@%4ILOLHyZ2U-IOS>-0prJ{ znhOY7SI(vdpbY=9he6mV1L;`9*qjFz3`?_(>l!>Py$a|+|HYwo>5 z(P7_t$ITA}DIKR@Uz-!!|)YU55cE~r#Y z=5FOPm_K%NMUo=B*QW7+07RQ0=FzE09JAKwqpXFaYGt{x@-Jm80DC{jH}h~Stk_A%W1EnfpR!Log80mO7BrP$~PrgH6D>(O;+B1SC-IRPP3wv-LLV31=HIC<%nH|c7I=Jqm@ zREtKf)NVp3b{^{G1Ar5LV{#H|kK4rTT)r)6{+mSo>X88FeIQefisGN?YKZ@ocn5eQ zfU}ib@w3UwOMfSvwh~DGyOEUz9#F5RK=f%HFfjkCULydQ?3(~w^?#(@aodv$ON=^S zpYq&H010vdfAqEPocx-eet&+nc*&z2THepa>t}O|9k2f9pXA(Ep=p?te4!kYRU@qu}2>zhd#b zG{L(}MRAB>Fi`T{oBygZFJ-pnGxawO5lViS!MJ)?g;olnPX8?VA6582^oJu5<|@3- zfk}Er&7klzl>sd;&3fb_HE`mDy|ZO)cQT62d~F72gjnLIE@dEFDc(}2UTF3Y(QzS5 zDUQ4|&BqSUOc8B>`s?pWqW|8uyLjL0`>#DuRuY$YmfsW>va`0xAwt4RZl}QviV1{! 
zShM5nYq!l_C=3Uqw$Rej(u@9jiZ)mI@p+s5%FKZA<#N5~yAvf!xZ=Kay5dtrzEkZ~~9p-=iexv%R|M}Mn&iunS zMGv+gds8zyQD|%HlG>SerLaW5FJj5lHM{V;!eKWdV1m7 zL#=-27|J-qKQ6S}C?~g@4OHM)Ez1g<7nd7bG!-=sN~l6vbyoe@{C8$Dm=IA}i^1`S z>-3rKpV&rTaJ960wO?Z!=>23)54asHB`8qS3OLcz6&ihIQSAS?uydn4~DgZNV zMn_k-1h^<$b7?YdS=}rm%Ct8=@TnV z)i0;#fCcKM7hn{Cc80zY!3`aRC=8wCdZvQ=!PusT5mjzzq(<|8SanMCk6C*V zQ~hv!tfDIb-R@24{wgjqCxRNYjE){JKdcK7V$YuDNA+w1Qk5# z!{K{&Ol345pnCE#G?e~bf&6Qzd=f(h88>wzaA>V=?L<+sxc``()WYhcJ^>p;Z1PF@ z-%s8HahZSM5Dj84{FBx3g&=uxb(82UF)@Z0pI#~AVt0#H{udqG?a8%*h`9IiZcp9W z*v=cS!&tNu#Uq+x%IneU> zc+w+x`2>1iiz`t^^~{~sILqPe_Kw*HR$K*bh#qzBp=lk_OXX^FDH4Gjgpp5MD^I?o0}8u1eOTy)2*G%)%2#7QrI@lq`u`(N+z zw)$Nc0JM9SUv$b3o21;oLJ4@?sv{Kg%Xs<#IWmTJrpUjU8MqHrnYqN9FFpwO>@RK? zRti*aaqeZd6tYJV#7C5kw-EobPz-NQJftzn=>-pE%O+DTb74iz9eE(R+cqz{hQ+ z*ru&bm8FUIvZA_D^3)`vczk>}@z{4;LPU>F^4i7k)(b#iv-0Y7TxR(cC1 zzh>%jl2+hNtfv(NSKCGink$JJ9A%?C;Z~ZKyrrf1OotH z3QG<9N~aD`UT4W^K#KzQ66k!JR~$=E1ruC&a<6NZAo(GPe|O@^w2Osg`(tz%t5-Gd zc9f46plX}G=FruowNHqA8B_8+pz4aru(%!%((2Wn;n(cDxVsl4%)e)h>Y`_Qy(E2-im6p4Ls4|&T(3Y%Vi4j0-=ulmCs8! zNk_etJtSI6`t9wzm{l$(O7wg#M2OI8-hsSVBM+FgE1nB*C6KiG*ZVIu;%8@NWgRyV z&A0kK9CpZ(Q>?CkHy6dE(Mo8GKZTa$*Lj_~#Y^p>9<7j!$zJbs&OYDb6@GFw(RnUz z=_a&UQy`n5q)3ST<@R;shd&o4$jIn>5j81ZvTrhe1qF&8Adx1@d<+|!(a2&it=mI=_-tN^XsEU@# zly%|Z%NeY>cJC(!gqJ6l|BfZMD#1G$p&Yi}CPybsq~T^ll3P|l%uZk>sKi2Bs&0WmgCI_jp6ZPw#Pg&`LqK7q zCUf0gwF40k-d`^t&}$C3q1w-C>E4#DjNG;oX}-d3BkGR=hxdGtR(*foJCp7l2~3$D zJMCD9td>C7xJ(JVN5jAhyjL_o{<%^YX(gzI3^SQpYF*GkrO5fklrH%vNPx?%h}Xjz z%d2SCbFO^P>*X=Wd{_DJRqyBd+{o)~9HuSIM>>kliw+mmGk^^p!`yd!RuNjn_h3QH zPqN|{AfO!LL=dJEd*asp8*Jlfym=ouIfdY22dd_yh-|aj2H~l;_u|giFuid}Cw#c- z`M|2T$#AHr^EK9E4Ym+IwHX>OvM56SQCV1e)J~jhy?%?k*V7)Z;IVxc$RFsGhX*xf z`EXpK!dcX*yObMWpG8008z}ii-NKlB1!e>5$aF3o~>Th(;jz9$!sh? zmG90vXL{B>fH}rsnTUss?yB$ifYW0eFU^O_k%?QSKpVsg#BEae?Wy?KXCrcJ!t?ZN zo!d!Wcy^IZ${TC_mLfjKPlir2t9-8QMGDCk*nDndZW1oh-TnAX@E(Ji#Vi4v@`(*U z#U=l4u-{`JR^jF+IxglK=4@RtH0LjF=Nm+vR}R)R(!nL2Od1np0z5ya&><$~_UJgk z^mEqv2DbFT@pxh{tX-8}Z%yMra7|u@C{q{h3@2Hn{J_Rix41Es&!p2pcT;o`-XE?^ zEpDvo87Dh)!9?2e#NqNBl!f_1oHhPjD8tg-;F{b#}E9 zphk9S)7Wex`OQ$Ic2;%8HHp8kNDO;HMy+HOhoLv>7>y|!azzcmsj%_h-G1XwqsQFAkQ0Yi0 z-#NX%1l6&9;C~LT+os6)?fdv4-;*Evgmz1oRLwRe{sAyQz|<}lH|$P*|?tS zptN0)TQQwq=14U7RV^~g9lt*U8-lIT#op0*#)YxA5Fh_R7hHdi54(a!=;rrDWgbKov1vCyw}{CIu(9w@p-gTY3?+muID`EjM}C zFwbVUqTXYdN?=8J^0@OaDu{;8htTE+tJu zuFI#P-0^VE8IvTk+y95Uw+xH2UE79L6h*pIy1Sb}qy$Onj-k6dq-y{HX{1Ct1*IE? 
z8cI?chVJh8F7CDNdp*y3e|-PmZF_$?xG`L3pZj$j$G-1{ZWS||?DbSWgMqQ#`NPyL zpYJ5}6r`u#ME@F_p)^0zmBO!@v+xrGOQ4l66*ip3fVdHluE40L_2hD}&U+oLVR$DXxCkqd zE`eRw12!fzNV_r)Re3MbNeh`1ecihDz@mF(k^&uWT6ZEzOt>Q;CFTPCT26_NVGgo# zDG3-FYYZMv3C(gS7J6L; zvORbgQEwcIP#I?Z206Ux1ao&=gYqAzSlDc6QH1!Stt#CDDILw94E) z>uz4=NL%3{b5>P22YbMp4w}<1jFkq&mikJG+$|x?)uCYnmt(-m|FHtOA$C8;crEe; zykIe`M+5zgk5;VXdF40h?{kUtvL}wESgE&-JSRMzO-Co}_T$2uT3R{7c_53a=C6@H z6Voz}UVx9sOP?Y^l^-M0$eMSCzrhn*}f?gKUxOG=|p z-OK01CVjN3RyJSrBo!@3PiEiTON885*;%vNy3(BRQf!6cC8gK(AW4^7pHaI6$=r%;&6gQGV7pFTth5=o2Xv zNwCQnB62F?=&mh)g&x7UBFayNH|v#=;zu)AJe;e8pGIFo$sHp<_t=^E>BbE2(Lqlx zat~(wDx=Jqf2ZzM_KP2_&R_$XI$X5sSt@yTc%R@G9Icv{&O(D~TE89L=*D-8^;}*b zur)}~9^8CC`G8W!|GXj#11?Os?E$W8tGp#vMVnyHYVBw^kdw0y#jZK_X zpnN*3Ay@j2bjgvhdUYu9`^zLs&*Igbe#iZO@7*?z()4OH>?t{v$-#tyDl+*zZqqiL z(#(1yU%I&6%_&w*FW$;_z!+vU--oy|c-9MSU0FlfIpLtg`T8eu*SbhW~j0olSSh(_q3$?c1wCx++MLrMuUA?gMWUxhm@5u+`ypHgkaus4kM>dr4T(>#WEf{c>F~7Y2gP_@zKq}YX{j~!8xIkEbDL%k{)Yok zZg_ciXWAwoy3e{Rzmw@lcto*KWhdmkaO|5xWz93b@Kt0xe(B`iajwjL_BV8sWK$gX zVFV9^yzn*gWC{kPbg$-}7uGhx!Bl8PuxUnM=ccing?6Hl7`UTMI+l()nn|nL>rst6 zku2eUS`$yAklCg#Tr3b{hw4f-u5Z1b5@NLqOjU3e`sUTMEoJ)U!rN~{VB#$GB+qSl z#vx^4;r(RD7me0yEtg2Sv)POD9>{r@2MfoqYN@c)H%i~+2bBnK#vp@CVP#6A#b-A> z?8ZGP{PJw>Y_~#Sq0Bi4sUpPhhcD$x^6{}@7XZZV`$KhP%F=spxvgjB}mG;Y;ND$ngu+>>)4H|s1SZ*&g5v@KPI~+0Q zb8t?ImPn$?w^&46kZaXh+f6X3-asL8-{r0R=!=-o(NkdUj z^2c~zrNsl~D(ov)7iYI=puL@&{q&o%n;LcOzRgCi9UvdJQK7_S&cYMKyXcWtZ5tE+ zB@{m8^!10$F{d0czkJ_M3g^<sXOqeJ%^XyIt=E4WLP8FmXGV@>wl;!$f8^PUg8* zjQZo%*GE2Q5O417fL8CjXs3yR7dMSPJ57G~L?=*K0TM-Pw#)`Cy>^2|w(7dCzMw0T zZU0YY<fXLIM&&yV-Y~KvZD#ylcp#cHUI>$pk#R@e@L+P6D_F1WVKEUw7h$7MjdJB8T2dk44Y6Ayxsr8x;IY{$vdp?)K!91NO8t@^*8bRy~@l5aZgrUhirtRj{y0D-J+>l(iu6s5=()Y@^IK& zs7^#xr_IDCR`mh`#L_pfO8B`2a79q?yDwXi@y;~gcT`hxj-YIa5uRM@;vaT&QXi&- zga*b-jsUeW>;+VkmLJ{E6`TdX$y#s!E?o9#$S43#x>N*rX^{WEu;A2YDYS%Hqkts8 z;%oV869ut;GNGv7Apy8C`UUc^4foBjobl0$lhtO)-3P ze3!!CPlaz>HkQARdYjmLjCPJAleq&6{!esXj=^yv! z>B%nUu~eNu<*?O^0EHgbfCKj7q8$?emk=xvmRgLtG5?9PXB6FgpB zx0n{=<;sXJpac;ujze09RKx)X2yUwHuF$ZP z9!n1g?Yt#3m|r!UOqvXEeUVk+`AI-#$g^0;o3g=^({@Z1hYi08G+JO92_$dwvn?`0 z!{xxF#|r;Qb6rY48)e|U$~_ynVJjxy`^FUenZrPe|D&o|8qD$;o8ehO_s)J=BeJ`~ zg46>Kg2!uls(|*z#(45=gCDI}Z(?*vlz-{h8%7(okn`fC3dpXu<&7mo|Neb2hBG|p z5TgJrHR0TC2jRNYCxpkQpWsRD@6%~)!y)z7_hn$dMC#Z5{M@8fdVqvN%oAhigWfHA z-*390vG!4*0HMxi!}Wr7E@&sL0=y&`NsII`<_t0~S~o_J>K;cg`OYK?egTwC1XXQF~6JK)e8Dr%jo}s55BsPkkaBO2rHwX`Aiy$3<#;m zo#c@>?KadRr&gPyB^iiGn%%jbaX21zvG;Iu&kX~38vZy=1l}HPIZocXLcVIz^>v01 z&uAEe`rSa2ECaRlpUQvq`op;#v+S zj#WRp0~0Ssj(5#L8+VfC60~GYTyCDbxdi^%tXGmu?C{iZ{6)==j8N(XAifxr$B>X~ zJN%1+b23Y`U;uf*WFZNk%b%;s?~Z7G$2IIk#GHG3jzA=%p@c&zXi~@u0sD@Ma!i68 zH6Do8JrURck2FiG0=Xct(0TZb>(lo{;ST|+gsw$;67TX^Tt&IL>~Ort_}vnS6{VCT zyNdtBG<-pfm|T`x^w-!gIU)xVSm54s3VM}Bn@T9V0ReHO+zq!(70mTd1p$ zx2%Pe0S!Sf8G1mZpp@}11XJ{Z+I!;WrS>q#nYnCaB8pWY0DBY-En3F>c$f5=$}Rfs zj7@ttgS~xZ$bchopMYMPHCe67+K^OZ!cwTJtzB{*#e#s>=XMig-9l}n@qHzscWeA@ zY)=;T>x1Z&ueY~in8H}WJ+L=c@mZk=#J`>Kpt2A?bzBM$!Fwy{|*(I&Qk7%?8Jp zA}z$dmp`R{toA{UKem=JWC7H`_|NYPotRpY<+TF zic^3R=I8bX)Vc{2Jfh6KY0^3}E8cOw$GWO=SbX=5p2^H->y##~%baoDE-Q$-?_H)# z#Mn%o1678g5Ce`k>8==*nJ}m?ESs|0WG!B^+5YUIwAX}n2>t75sf4qS(@nTf_Xz{^ z2>UjrH;m8#kiYSw;!ekyhM>x{#z-QrT`zD}m! 
zv&!$@D`Uq0bv55mXubSl@5-THk1mj6Ml@W5B^IaiNOQ(&@gmYNC84NKc79;WUF-oo z!PZWD_o+6=D$!FM=O`Z zo^f=02lE2 zGIK+5O25;b(Oxx5cXIFt{}0TLy6@lxsow@QxApPg3EYeEtZK~B->#m^$g8^^%!z!l zQNq&0cMFOxldp^J{Hy|*E@)bOYXq1>y?5Dx4LV@b5v%A2ock z_v+j*jZ1U=Eiz!z$z;X!IAvvH2AFW2H(QpG+4*>5rofNyVM&Q3)HJ4#GL&HS~E<>UpwylFH+QhZmz& z_$0jdRvC?7+u`j=z;vx+1yNRF8@5?AHM9UC%RUUL})C+}P!J|?rIa7~Dd8CU- zhJ!0=*{m*#qhk_;UU1%D?o&aPRo!-E%Hpae%3mdeX<|2%+mQJn^#cdxoH^0 zx;eS$_9|vM+o?d?*-CP(0ekTdea5u&vuHtHsVQYWLu>HOnB~?Oq}N#v6A0O)9^E>g zaQJ_TaTLuwEpTSq1Sk|PsMuZ#ZKv{M6a0B%jcZ|UEg8HgYaz8qho@~23a%=*Ng^Bu zjpx453$e_1o`I-+7}{Q~$MwL1$iX}31=hzp)#BfG6X0P+je!kVQI)p%96t*hK!xf| z)%Jd2!v-k!A=H)$ZJcdbbic^xklddG`4A;#O^W$|4Q|qTkMMTWm*UZJgko`)S zrELXav(@j34g622PE0Bjo7hPp@#rGRuQObm%HiNtk4N3C74tY)g(7q!>h#>0WTZRH5lb%s>=b}2;b1fP6BFKbCv{^XA9usQ3m`D=auFml1A{EJr4{308}rTTgHO zGS}o(PlnxyV5&dYP;AC%gD!EC=RJ|Bby89?ta=B15|u-9db*$2mdqVpD!yH(FQ?GzyzZ`6MLm_KzZ9BTpWOy;Tzr{2+67rPf? zGQvk<02S^Sh_PI^b7#t#=0T0VjC*%oP|VTFLe%5?%L^tVX8@=L~|9IR9W+I zT$Jqn%!FX&R*nk-_cMvwi>3gTY+OuVN4QNoIm78FXNCQ$uj`H~X)VO&e#YYf{<_7- z>*k>^u<;Cs!UZaS{gQgH#4(*H;%V{n?*`JFaSYP5HM_sDTFlzRu12_4PXulcj7|%u z#C>bduGLq2)2S+9+f(pf{=b3+?-K5WJtBdE599so`iE+;$+71SAzMZrMs?V*Iz%A8 zOdkIYT;`6J!uQvy#l%mw6v!D$5D2Ko#9Zd=O}KF_9!0{BHBhZKJ4vYkoQ67g+_{n6 zg7!B#8eV!}WoKRKIp1IToV1G$JhaaUSdSKKFpXQQDNaad#1t27V$>8T2;8NN{LElr zf|6!#>Q-~Hk~BZL!8x~rP2FCdz@ul0T`Cs!kQRV!MaV!(57F7hk%k_0kU4tCq&veb z^kC8D37W0ID`#hCh=B;HYuY4|s|T_k!{wR3q{+Qt)0K+p5F_Yt` zvm@}4Q1O&mM>OFUk8Fu%EP@|Kg=tS+v=+Z_vNs|~ihoC+d^MTHY_RHgcZmDd*{!31 zFKfid1(}=Z_xlIeC9aABGZuR(O6MMd+tEXG+>k=GAgcUuImB!N``e+*>X(sIOfeKY zVJlmm1)9IDtBf;1-iKNt2+JYI?=)>&%TvfDSeqJV58S)Bw4q~K-ef4BAE3p zO0Apy76gjyLEo;QHzJw7 zh5@M_5$pPUVH|k%#>h`84WIi?u1PwAn2^4pKNh0X*UeZ2R zhXd^h`#uH6n8yX@$Uo3;jxvwUDyrgj#)p;Bo{m-wZ0zprL`C@08r@~MZo{w9P}z+} zJ><7vF9d|kiTI9*0O`~>A7o_yNvG0c0Kx@)D~qi@70}v7|7B2o{s-BF?~wMN2vZIR zFuhZ)8GN%}M$%u8LhK&m?;IZHd>u|z$KvFT%@*2E0^#eS$J-A5z{8IoVE}WPuk=wT28G7 zk;{XpB#Z(}djqv8m0r)Rc#6(ceu(B?g5!Z#R_2Jt$^FOyIyNfH;yg`&YH63*8liHd zVAS8wu*`|J*9060nO94;t66kzW701Xv{8upa&8>$3koVxq@g6l)K`R-}v$2vHPIF z&MY*KlNHq0iLAk?I%dXwc{K0i1Yc;MS-kLfeo!Jjxe^A18-0n++5sYw(qU|1N|kA7 z+E-n+FkFHD+z>Fn@sNWaTiUTJqv6pn$;0e0yr^)wF%{f6ORxO++<(8Q1!&5U1FJc7 z_@mifovWshO<6x65RE5TTHO8=iWlFz^^^ER`YJ)_2!R4rqa@AbE0)u zQ1e}Q!1#;1uLDAmX)dg*{x#&uI$kxSB`JB}n)iYdLo0gHjI1#*Fh22O&#nw@&C<{3 zp)m5GE#Qh$k^p)JD#j6&jva`lL=Ccof%9g@#B(Cv_}_`{0z}hq3GD>d$LlhV;cM{Z zXe*a|C>LdQ?kh~iedoIlSJiR@LVuncrn&M(U66pbhw6qhz^Q)e z6|)l&6C{WjRlISh+u}gD$MjPJ++)Yz`V^2o!7D9FPD0Us2G3n>%`a?ajq|}McmPi6lv3?=%kcXxhIqA zNBX=td2tLs^)0l%>P7noR4?x_g0(NKgW3~Fs80FLq!pxY6T@NQ)sdTNl$=zuvXj{V z$Y^v>VE>4(lZD>wn?vu^MajM8tD|~(4>5d}Z)U}MBJv=K~cS|PO8U5~uOW%@< zLk*F%(qgY6hEKcC!-~i$+bCJR>_*^Y!22_G0By9{-TmWskQ(AnqUXB5zArJ(2yd8- zndW$Q6dyc1UCHj)8HkUUN<|dgwl!D~7u2vBBe;}DTWGj!*1O=Kqtdc2?AmhjTm<>_ zxw|6MVy^IY**4kq6VA03%6L33Kd&Awj{B8!Ne+b+PE09)Wqv8MB3)Q|B;f0HgEWB1 zD%oLzr2NfVexmIJJsV^209l$?$4kc$F#9cBY$f%pP-TB-xl{X)Y7WYL9O|lp`GUx! 
z4fiZphzd0|lnQeZWCPW;6UeQ$j{`+Ip0E5V67urS8;Dy7&kD(UvF@4`C4zk3)*srl z;_~)vXo81n%twL4=m(i^{Sw^Ml55VAQ%B!Y zvy##EfVV>Q;;K`|U!@k~Y8tJXMVVJCAYgz&Y%z?4I`+)0`wiIy4{MkBGZ0H+3aA%r zhN7Q-Lqu|^Fpzql*VLVMe5!?P+P&l-MsCy*T>RA7&J+#V8u`vFO_&P~tfDJwgj)pm z@_T+`Cw-aJRzxhgqb(J)Mr+GWzy8D-gpQja*7+e*rC`eNEke!L5mq27vkcThbLtAYpuRD4BY&EOeceY>M@k_S zVD6q}BzHIJak7f)l-w7+P;%V{O7u1C6#O9&`H4-S0rq3X%#fcU{oa zzxD{O>bucKBYTn=ulLQi6SX{uif*cLe%nE>ZaYmUdj_N>U7hB!moPseekW>M%ZVat zt-2}PXOxZkc3984j8tBuZCu>DYJ>*lG$Gzo`|Z#nMDcYLv1;By%5j!I!fhl#S!~aj zb`{5+6?tG=NK~Qqk7R6{;Qc<(Z|Mw;A~v+N6yCQRIrBej*55{@(`vGZgQjQxbXZ0t zq+NzmNqhq@FHWXw*yDgncjCWsz8ofmtlOrC4FKN7d>8f(Hhb^=g#t*yja1IRVB0g0 z;gI3v3IW!Px=)#BUd8l2nKXV~ftcI03F;e<#TF(&ls&U3?{Dupa&@e5OUeW`i&%U9 z7{aoGPTm+6+Kh}(r}bO}k|M#0%03)-=YMoyR(yYDKeOqJPX##J0*UNE)B^UIO=8c$ z0~sY~!D!50R6(;03Pos^@h$WWbw8`M;Fss! z5-M6be|GL{>0YUFl{}3J!`o;K?cb~KKUkUrZ!D{TE zejZ+R0S)MJahMfLW2kF2ZaTh{U1k1!QqWK6KHBQBO$o%%murPvbEc;C-zN#DV%_c= z%sCqFJZ1h`gQFMfE_OG0MkTDjT(OFf3{L^9ijROg`H>14Q68NBNI28yn~OQo!OQKC z5|6!Wq6Pg0fK!cmuXP+73Hgw~fLhE~6mzQeyM&mzK5A8c)Ja>??D(pyCS2~m`6SOW zik7XTl%o1`NdNK=@ee_~R&1}djRE1w=|(^MSJuxT3SG{Ar~I-^GlOp(b94jW z6oGfE;DOhdQhPI)xRS{9(QZ~T8?U~VC*FGpYIT}XSTB?ozVBqBR!AaHPf}mXd8(UP zXXy0M_h_Znuj0Zt%$r3iADMOi_4#m7r6^_`A#aw!mAb@wg;Ij;-?a5t0fx_w0ngYd zEjyT^oiI`#CyjWP8u!ydMX6W^@$Ct0A{~c*-a3D$m%~;z059nvvwnFyfJwG=;(K|W z@VCMwtOTQyDCcd~#hnZN!;=&75Ov&f6Y3TmK0G~SGVG{cnP@sn%j>+&>IOFEQc6;u zKJd)Epu37>^Rd9C*G-XDwThX-(^~ukwbfu;X^h_=3#E*{T$g-_7uV*=xbgv-2IvE9 z5zH2tLVW`VBg>wI9HR5R!gooHjkO0Mrk64XPsB0!n~m@5=_ERq^>bw@%3%YgZig4cJG8S-oxSAN*a!f@h7M4~x(`4~ z#%%U|d(I;;z#^~iS%e!NUg;#+D94qh{s1bi)~~NFmuKp&x2xk5WplM^%_=4eE1Q>Yvgv;=*TJa&ydcW&aepE9U za_f4CCsE6!fByHBC13W(-G>V{Fwhlsc;j|t;;m>rOeyzX5347Vog9&^q z^XLzPxe@V{`rsuAW$!DW-ndkQR^RTBPJF<8hAXY=hKX|n7M$_InG&1?5v38PtZc;4 zSau@$>#Oxa8>+#=!^L;avZzNS?{0vqK2eWlZqL!FzEJ<2I%aFF%qs%pTp;p(eSO9h zVf^f2WOgtKLWu--Ta`bpEn0*_*>C{1|+Bgu1<58S^#Wh*I#$DI6 z(CS8}xDN^#0Bc0{*GSC3EaLc&>`6?dfv1?oTJ3xtQT%yP>_lU(0h1vM}^go4o z@5qxdfQvyT_A4FJ67oc^6sr?Fhz&Bh)sa<@xeN_SLPfbd+R3V4e(cTtv|P7lmcEM! zaMROro-yAi695Oer7Toe+(BT)E09cAqY@HsbN5XY8ZLao0i3NxqINMI>(|p^q;Xm) zEDy9PJgsIe%0Peoz-d7Fxe2sMVu~Q_8XmsO1stICoL5bCFb;Fy`l#J{si|`xQVwMx zSIlA1cyy3RrhBou6H(aYhaG2mjm7$Z)Dz0#ramyGw&O2zsF~ceal{k>Vde2-2zO zv}6Eot$UBEYncGidynvh6bJ`Da>~cJ5^?_4%j|kIi~TtWkiut<*TyAHq#n&;XCBRB zSqu~aR`)r;C%1YemY+jB(Sh=_N4MAiU490T%4f^~uUgwHr)@x!?vYe(`$!#6Vk-d5 zX1yPq^XPZ$4Q+Y}v>sr||5u46AY^7+4Ls?WkAN#?s`8`P?XO2i+ks>ifIBbp*qldF zx(IjyAf=B1X0`wIat?6ffBgZT^jCjjg^!$gok!+7Fe3kS{qY0G0T5f>0$Ro`q9UXG zPvnRiV9L+h08jd>zh>i)ZnZ2xQPaPM9D|e#a-iuz%l-?n04#soUw`+P%ZlLf zBQVJS8jgQ={c#Ln-WWjBftJY+tA703$P$C;(QKD@_V4}@@;r|BquDOP#H)(GwE+G# zH2)rk$MIMf0hnr!w!!~ZoA5tYLsTz>=yB{H3HtxfEQOq28=e+iB6OQ)qF%kmM9%>K zK~D(jm3Q0lFgmV|hD59B-SP)?EjjVG5ioDX5fKCT%d6|0$Cyj2(}3vQFd&2P+F5xiS?5y0!^zY3k^+@fE|G2(*D!S5pldR3?<5wZs!{M ziDztCEfT-K$W+~K0#;6$YWMr8h#X&fPaS$jesxyb!}X^NDtjWoF@5FDMG)7EVYNcqc)DLjY@%xKAV#E<% zt&mf49iz~^_$P7p=@mhw+*8Wue}jg9)3rH(BM>U=xAvH)&!L_D4~e@S_B*RdTk?8^|a0=-JELdUiBKXfPQ&9A7oYGliXr z=|i3j*uG<&hJm&%&Ia(!DKLLHfT+A8z{Vco@GH^O(M+MDiB7OIH8ovn?%H3~ylnEm zc)Gt&iUFPZ@Wql0*jPFd;tVOZyZziYUAe*!J>(rgrTh>K2r_eh!THY$r8K1H?1{F0A>9K#E#lq;W|i4t zXA+Ea*<65?N^;g<>?ta%FOkncxLAMk{0jM0iL%;s;8^qL<9YhVP`2;c#lDDS%k6w( zFD#Yo&HkJ|(6IjL43i`4FZrpPA#tgu@pl#mt_{B#6Qt84H`}*-pF8B)$Jz|3Y;G*@vz%Px+ z-)BwL7obmNT)@ms1_Rwv)RZt&eS{_=z_b4w2=F`t0b0hbzYB!IzXLnm>gy5k1lq?Z zbO5j{@DTzsJT~JmEcp20FF;iE2#9!;WfcDgM9ee*AliWdfQZKkcoa4spw{$=Nfd$J z{Wlo=|Lrm0?`HVeg8qLx`}pB6BxLo7gydC!{~a2~`0zO7f^gtb@{e#6=(W^~NAUMJ z!vBQ=RUT2GRc?IdKPehn00mAGvQes71D*Xz$4tq^_z1)@fm!=+1e^wdGZpN_)4#V! 
z*T+W$Jmm2>r;m>Udad04*n%%WQ~rf~A3yvB|HdETUs_=v-M^0$@+2$({<%VdN2OH) zj{@cy@_%cJa}`a}Qo}tyMf@IROa;nrtHO@H3MqbWeT#m1eVy~KJ^aM9utGi6>a*Rh zkAmzkj1lPU2bV!Tn)dwohd6=lx)VgN8=!Sx3iUgN6(3dtwnv05)^+IroeD zV};+JrV)1{<$OR?VhdF48inDk6y}JfDj35+{Z{j9tNmT+9xYvT8>WNr6 za!9P{I7LK!mk)+b6)#%vVXm&l<@yuJGXDPeEW~UL*XkN{v(uUccOK^X=RFgb z7sa>Q#87@I!b!EB|Y*Q$rf}#$3ftq*r}PBeyUQpU=M-GJB8cZ+x7?N ztBoHvW>8asOZyLsdmEX+?>vb1b8zuewfGTnu9INKT{*9D-u?vC$%o%h{s#+Dz9rw5 z%oo^S@QobMAg%hd{IE&6m_bz95akati7AE;doG98e*aG6J6z%U^@;0pKkD-ZTH6^D zdObmn&!ImFLB7Y(o>;P@>$8^=*}s1MlIZ^;6Gtv4+!bYB^n3aAr$Mgx<7*op6&c8* zeXp4q)bx;h%&af0llMLj=Yn(Dpy#;u` z919Yj1|4afd55z?mDxC191b#(S@rxd5W!!^m$0C&QksylJo!k@;__ERmvCveCTF-) zDVQX=-%*ya2R1XrTHIg7OF1?*g@1#dk>h+*O8xo8ye)ezdyqf`u33H~=JJhR&GR-0 zqt1sGs-rup;GECP={VnIao>ibY;Aha%Tt;?fJc!KlPF?IMRKkMUSQEeF5o*D23!60 z>FGisf6t1Tgo`(9e!A2j(q2bk{rPxTUUDNocUu4&mjYtD2wj)Hc?8(r)lX>bKP270 zA>^_eBv5UElWTFRa7{{?9J>a@92F?_J3i49x(rr{@eeoxGxk_gevGLnPb*5VIVd8% zx*SC{Xl-Chu(!{8C4)s*u!C-$*dwZDBCe}N97jkAZWvl2e%;W}@tGqlEj7M|VhVRX z+(CdAYWb0_VA1OqV|eni^8g}R2Zx8bixSXLKxI;@aNi^!;kDi?0D;MBWX}(P6!8Bt zs>q_oI0FRK!O+sK%4O@%g05I5{a@mV8qZyh2sdux(wEl<>G<9ETfDM3O^pE!bPQfz zX0pMhWwDOsd3<5N+t;kD_DVGtyt@lLSdQ?S-xp71wHq25mZz(T>mC-JeQwr~+}+uH zt}hcM%#3kZGf+(j*2NoK&Shi#7K=g^0X;8U4!2!KlsCTwQJ_OzmzjbGI1E@hRuk`X zI9WeZml%_V-rtsYgvs^#B89mk{ObmUSh z5o3ZhC~-N4zA(JtSWm|k-Hx2C)pJ;f0cB;d^sa+wo}2j*;z7lGLi1BvE)KD$AX4# zow9?qtsoqRJ>^mh{neqbLBT)NQnsFx%B8T?V5FK|9(RB3?>F!`TKO7ZyUbCpltYlX z!7T+@Y`V%OX!V3k?Mme~i@%2;xcCo)Jn5mxcy?Xc0cfXuW$#mLe0*FWieDS#AgzAp zasuzqTq9Rylu9*0t9@Wbs5YQ`PgD9D*G>Yms3f3p-%LW6->Rd_`hItG@yd1<9W%?m zm_(ycDVU%DIgaB2508~325s*l-!>=-ECOtxDZ3FbVvcNop)X7t+;UJK6oXc;<@GY} zI+rclJNeUqg5+eZPs|Nmd5Yr3Y@$ER1~2tsDo(9U78IO=IF0W@h*z#o_+%H%hT>zk zUxT5u48FDcigm0cvL5uytc@&TJ9x z^)|;|6Gq1zu^9Wr5jbD1T~mItE582dk{{y;x;@vS#VXCO1Zt?ohr$dVR3A5C0?XfF zuUAhIo&iT4Jo`g((RAg}bE#OnfbEIMQN2_CaO|G-8)Yq}@ovfS(zU#WS{tpW$m{o=}CC6EO$;h5H%;U*3CPtpHjyYJ6ig3UEd1i4^Jz^5k4){i(>ns@;juQ zja9c!8h^lhdH0q`tpul|P%}b(&trn`6v1?K~w(QRO zr%LqbPkeu{?{5r1AZ}mCr1_kb0bG+%u?y0%Dzialh?`Xa5l`exrXJ}SU9kx(Djc}k zV88KEE~uSA)a#T8HvQ*{WhgWCRD@Ep=W@4RD22=190RIKO}2bsXD`WJ8rZ z_<@`3jn`>L#6(GKo24r9cxe&X#@zfwWCa>%r>QHs#@LAqrwv#GPSq}F}c@+(L%Sz=)Nt=#2u`q5Rg$rk?TlqawVP3=3dDp zSg}BUVy4>Jk;%y2%iQ+oY7DFQrf{%KB)c*G{wm{pvmP6nsJ(Q&HmiD%wbXVmNv}kX z?hF5bMf=Hbij=xlxY7}>AeU12JSJw3!{II8^+FI|LHdWx%UCLGxIw)?tMs3qk5k)(&mwRkR67!`Vu{QZ!#bnauYK>{v*lp^j z9D$5sSk>?2e0ojWE-5$wg~O}-XIw+cv4qm+4S)b^zb5uQKm!bBr(yW#%_;Tm<3+)3 z&k;&QwyxPy%BN@X!Oa}d<1bxS zA>r_R{!FyGfYbalut72ne0hks*vNex!d!7sL0L=0m-@+kctQDF9*QKg%N=XXyKPF# z%EtH25VSbUC_I)P#y{0gl35sp)xz}Jr4@qJLtm{87j*4xo}2reZc4gr2`X|ZWD3V= zKU|%-Z;rVm6|1oX=F1N#OF+)t-wB%~vL&(3N44A~F$M@5`2dj@^|0Jl%31#VmMDxY z0Oif(t}TMqo*!U+bY{JYt5>{tqNcjsGW(${hTX)aP;cs(Cc6^$C(o*DAc@X^1fZRpL(d!-*P zBa*_~mxrytPJMlg$tsjZ(t>kUG^lib*__t5lyZtY$2X7`E@r=UTJ44OC1{SQ)M{Pd zZjG4}2w1TSi!{Y?s(qyXsVFzPRJ|!=IFPheXi)LGne*1Ruws+!1D14b(NJb6ukCh@ z)LG?JMH*mnnrtWNdy3KMl&3K1Ncv78UOaK4C}A-0>4EQknq%D2(r$7xo4g!eB-f!@ zx0L58irk;CS|QUe9wsb;{@p8bqWZ?MV2UI&4CcO=c_*mE24_UB)@;A$Ingk?DjT( z5?(^bB)=>LF=|-CJv*2xEf4p9LVs&sRLF6{$@!IJw`9$SEIfXl_>JT?$a0M-gT)yE zd=OTX+A`GS$i%bjkdYireBl-el-IKp9H2w~QqVd(Z4 zNkt+jI}^nUe!|tC@FwHPW(2;d57w!?$`jTec<_^wp%1ZC+2RS7DUm?79F|BV_>hJ) zkYq}CR-MM4yEMIWmphY?*^edg#&-jrLwI*>`h}MN=&+*3{P9KHCnklS@l#r(O2?iM zpm6R!iNK8m0Uz4?)K42x4zBRuV=Kk7<{@7B!b3z_R_D6@@ORaI(~@za3Ni(-6us40 zHbd1^-hCO|Dkgc(Kcs*8t-s=`mwm^-I&N1=*qm8t@DQxEQI%VhDV6HBi*A%|7_>4U z-dTt(8hu|V6EE?nE1bKrm~mZXhH++S^XlkZV5XzgaJG=w(T=;Qbh_MbKgRFzH5pv3 zvL2v1>XYr^YLLPXI?ou4N6hIP zWoqX8yIVQ*6gw5;MviG56dpU@n76#Iqxp_RY)NZmhBPo;uMWN5hN|T2$$U(BD|K2c 
z>@mkon9vaXDEievU*+^s)-EX1~Hhk2U zSK4jki5QQ+mk36;QXU^4N4HX|Rn$M``!p8XWj8A1DF3Ca^C=!WK0G`QVF2nVn}qEg z1pUm4?H?YD99gJ7%|%21_73NaxUl}r&kb`dMkXNQ>FX4g&MxK`nD z=)m+GTL((_Mw|z)Ge=FiRk^U|={hN>_Gqo?1qc-bqV(8sV0#CBizBKUc2?m^!VtmG zW2qNzL~ysSs^Xkj!sGH#Co zGkCDh8+g4a7`dJm;jrNH`wQ44xjW|%X&JC-1 zVf8&Z)@F-_xcSpHL)FX6FRmM+I7~KE%YGs>7S@GNkNZ6c^a9fFd++k>u8%cwKFM}H z{-HA}Tedx1&~cT)RB$wZlr?t%X-dT8A~gLFP80DVN`;Q%a}&a~SCC>_;)q7WTEf?V zioDS8p;k+w6|x=KqsQUVU*{#@;Zn=(P}dvg9v2eNEB88xsnBoM@+2}u2dXyHxrrQ4 zxAL=A>s@d;+61!gqQTHCo9sb3mZDBfn9*bo`savX26}odP%$z4ZiL}9nvK`G#vYMM5VwYX6C41IG&RdK z33uH>yq*jQKl_rRY*t9vNI9O}Ksr~8b5uQ&JBf?Wf~$`aYcc7oE;ne@8H@}Tr@*3m zU}4WMRGl&cO{4bo;b!ytaj{-occH;#u$)sx75biiH%_U17!`G#`x#k4g%KNhX%AaH@8EfL$HtV{gpOxZs+G3B_mdJFO}MaDDH|5Xqrgc8wn(#(iV>yxz;A zocW~ABK!o~boktlk9=tY1>|=h+x8e1wjxQ%J?3cOhYo=P%)Lh~B3`y@qPYiM`|`wI zkt3sqR}*)a0igP*@dZ&}HHsb;aN{g`@ANk~>CHzgiU0jvXV7%dT}QSnQ-N4KsmB_A z#@TSD%j>O;-d2qPEH*W`mSHS`tLsZ9t8B%e%kCP3yqRgN45;!5A4A{J03Wz`)eXr* zVe9SDtR++{-+B#_BKpYDC3n$b(%)ST^D(~tM7YuYNXB~suXuXt*1poB^O?)ifGjpG zZe4pf>FMI(*07I{RdP7PpXOH1=+Kf8PbOxhC!vv;TgWc*I{8Bk-7XSwKd<0gHFT7a z8|DTaP8<<7YF|RIaabB#05S%Y8V$=MXr=svVq*t4g_^BhZ`5b4b~vYcsa(=NZ?KU= zn{LusI*SjZ50#JJ15(#!2?m?NAW?f1JA6~C;-X0AUY*3ps|pRhv;GUvefAbi&*Oa} z4EdQktf>8sNY+>?jm`!cMD!PR{Ok_`Xko#N;MKkIz+fd=JXU0L;H>RtB)@MNmF1EF zpt$xzSnlH3^S*Tz2~d7#WBn&1*2@h`1aE4GUMYT|4WB;wAuPq(5qCmHXB0 z$BZxHE)`j_eThmgeJk5*Igk(VC=%eK)y0zQ!E4Hvv z;5);a+(S1T@2r>6f^25j9b&NlO)W2m+8^DG5c%TwLSZRs4ujRc*-k)eRMIIfbPK+- z7SeknCJsL3Hyxabd;Q+;?`&vu>1V=W29)^Sg_=C*{w%2R!tSd{JNjtW2E_FR1vdBB z@ES5O{aDbBb&KB?6(&iS3FrU(?s?xl8ySC~_iJc+HNh?_&^HYmpNx-7sPg63*{S3W z;T+7Ilv>1_s53r~3-*he3g3Cs3~F2=L;tRNpBAF@Z8I%h;XiVksMs|Jrbm|yxHgfb zKYxf(CB1f+ePW~DuKL-=-oPvUejz8r#foJ3r8jbZpIGR z4I(5*6C8!7&-TuUDe}dcQot_#3Oj-*5rtkI3USo7GboX)y|MiJPb++kHm}T}fbyV7 z8n=$ZTee(Rd}JL*6M;g=Zr2zmpNVSDu+)!)>16y=$Fq6;wG`Xt*N|&|#c?z?4MKBk z-hxYevQ{oUb~H$Q4vr%lgD|*AVQ$FGh8bJO5bHMQ6VDz4wuNx9?p&7S2=; zw@K#{|{e2&Ur<*y8pFEQ(uC@6bZ6AyA zE2$8aS_hAFqAkm~BI}MXAS-65<$yax#>@BLnQo9@?}aAO6@wwJ|AdMAqoAuF^~>>y zTb%hRbT}ak*VdatlJV?z6gQvw@G~<`{^Rjc>HGF9GH_{z_mMcOZ(2;~KfzU%ZkG;{ z>uwee9#HzBW=Lyw4r!%%kYf4Oa!G@?llEz@(!RT;fyM#`AK-~0TH-?iou?pS=1T)1 z_Q?J%wF?=$>eZR+l!JIj^>>3aQ^#U*gNTb$@6#A#@HxGlYlcKY0K*f(@i324ssuGfw%fITEdv>X~r( z+d)0L3SuQL;28G7dVZ$dG8a`NS-{C(uXIW>X}p1ViM6K8vQR6dH??bT|M?+#$P=I= zr$_JXyPS>EgU}c^TFx=^@EWY+vTxL=6fFBKjRXZttd$IkKv7v^6inptPU$7H=zSuB`<3Fo30B)M9i>>-)_VN zmo(*u{Y7d(j@{Wuqp0iD(m$_oI49TNi~u)89CZ=;nLarVzh-Q@Qf& zyp-`=-h+;>-PBdGabFH3WdVA_+7@oqna@KVK}sIiMRs@bhkVwB6M@K;Arf&}!m!;O zGmXEQ^+_+Nraigb2n@7K!RP+m(=64S(!CzCWZ?TKF!L(Iah>P+K#=AlT~UBVZ1PZ19+F#v z?)2Vdhv)MXehluKbm>+{&{CJhGlP=oQCPo^dJLg9m-&Z7*{>2IiceqZfHbRxK+rS= z955M1^c4RRSI8Q{h1N*RlymS3k$#~m{!Ylb(tvS8Kr}p-HWaI;KGS%OceTLayPW*G z|776y`;HO$^KnpNVJ}V@`CKn+{n_@*f#c)-RgwbJ>9B~hl*m%y$4eX3v?O**uSg{i zk{p(QqiTRO%|#s-SkLoX4Nw5Ay>ZGKUS;|$2SnV8;R(w)NM&h+Bpyc^gH~vBM#af? 
zINh_2_>F=_-5*kjE7L8e+L%>8!!CtBbndHNvs`$t@TCg067~~&L}z_cnJILuZ%E)1UAn5pex8*y8Ll=Vu4|;8Uwo{ck&Ot- zTD{4T+O4W&$s~DTD6^@KD&}8rTJhq@8U662wTdQ{0e4{k=O%-sJfCA0AFcH4ErQM@ z=1S$2I(^GN->V<`TH z*HEyY04S1zyOyo=Z;Iopyuf4S$qwPHM7r@ z?`iY79Kve&r8E?eh3CN|z>i;?t-$ne8kTONne zWl6SIgG=q}*old+3XmF>hOnbg3-I)3i~9s>U0wc67Vsu7K?i*DaG9ZKF9h2g+N1m2 z$n(#9A*MpzoEkSb^-|QE>3=NAXVfy;kgur8Rq2b5GD*0t)QorO;+VXAVc2X_84Y3? z+(g8ZViFw=CN-ewo>5RvhT_Z0LWl5wLv%?8@gNk|D8RxSo`pDIHQ135i=Yrfsq^0F10I@83NJCVk}#B$ zNW?MPZ-@(_#(IqzV#|rNOPDA{S{a=}??a!yXR&>ZKbP9lK0>KlIhLY2^tQVNE5_sK zjU0@;uSV&-;m0km?kcS*?7P_U!YWV>bqW$>+HQ^7rp;mm?dUi>Ol|MlajpGbG9M`e zr~jK7`IhZO(1LsQ#SWYOE}O^Svo~Z5$=Dcoxci>{S*TOr&qPKHO*%YEP1|)kqwb*( z@mmfbmoFg)F8)lB=~MSVeAD!sR=51k-*Lb7ix@lptr6mXmRB)tjnqL-JB@du%V|PP zqftB^sLs7{xPnS}YT<|iovEk%{(e}FV3ZcLQrUm;I<`|FwjQ?BwPZ`LnV2q~zZThCKKk}PvZ=%F6)O;TRg>Qw@LCvWa#;(afUYaZp$biV z2eXBnkQ&ThZKILov!#;XJqniujg%^MMdQ?cyj5vWJ@-4|9}nj0_|5CYw_HN(9<22I zvMFk!6Vfy>S{d~l+ZFO%vqXkRfRgp)?#QqNH5|EY@}i68aogqpvG>-0QFUL#@C8T- zf;1x~Al(BZjdX)ZOT$po44q2H&^glG-7PgVh;(;JNl3?YyzbBayia`J|KRy)fA(23 z=j;_{pOt&9khHo{X{{s`e?)bfh<3cE=)L@Px=DEHLwJY|MP2?`c79uZcrjwFk-lpy zUf`~iu6gye>Nl&Nx)H`;1GB|?-qPJwp`0gOIz!uZb}wDwZU7s5X1YI5KS`a>TG$eE zQa0~BEe>=P!1-XH)C?u*8PP#!J^jY*^kAYg{){SHR#PkJU?4X4HP~V>;%fkLu}DyC ziD6*obuZNoYv^Qp=ffq6vc`!DL!=X_s4>d0Qm(k}8$!(f0T+fO>EprFK*hf65!Hpw z7Q#2QeFiC9~^94#v7d87*a!@w9n6ucHL-8}g7WtJ+iMm3_ez9`)=`ZzO|U zQi;K%=A}W3wP6B@yMSJ!m(=-$qNyCf|LL8ae!gP;-UO6VhDnD7H1`J1dU1Z_&((#j zT>9T<0ch7d(Vvg?acdZA5T!Y@Z(T39W~ba(y|Y=pcpq;I2%ov-qUY-k=N(@cm2V-(@;S?ZBADtUwAdL@Xm!~J^l35 zg%cjL)E*2Ya_wRTGH-{8VnB z>}+fUI!K%qAIYwV?aw!F^Yu#paM>=J{IG_Qdrf3qA4lCt(9&`pFWPh2_Kl}3*Ry=n zVvvv{Gh^-6Rvy>MG#yAb${okZ4Z z6qP2aapxP+0zD-q z=dYo-iVQ6kxqw9E%?^sY#v~W(V3h05YS{96r8D&3{mG2HIiEFMm~z`c(h2F8vZRA{CzHrT2m{xSvH}C8}<=N?ryVm)c!W0^4c)4|Y)gSNfq#S~7 zPD`TWg}ubEbKJaF+Jf9ik_~K_)OERB4OCVYDAoYcU5e(+I8zlQl2=@LXUiL7+<^d@ zAMRd#aYXf7H!-xHXmVuO8qnd97FD^`PkqN`+&&xWt+3`tQTSn?^!xeH{K#q^<)t*W zTDM;N0wgWGixEYVTlA<~ta2CQHRkNbvzK8Hu}^q-zG(%IZ&yr3!J-rgo!aR&TGWTGTVE!g}#uqs6*#`=W(=uG;ji zmTaby>2Ku1%B!tCC|b{U|Bl60JMdAV;h`*OTpb9eb%-YVIPM|*;ZNdco(RN?Tz9>U zb>`ev>?gErz2>7Jc8(mr&46e5FS`G5&sJ~FQfR-mrel6r0@Lzr51+zzKa{_bZW1Uw z6`@wEV>jg0o&~$#sCyKRfW4X9mHB8W;&^0XT#-E(7@Tvpsx zZL?l;1+DNGS9wk|RwU@ko`PL-JJQacs3c0f1F&{0bFF7dRCA@7JV^_;&Xv5{TuDUI zoBbV8haK=Db}kI?R8&ga{IOx{_9Wenh~85X`g@8dEbbq9pB}Bsd6|6czC#<#HR53ROU$IaA=*@ zS1dM%Xu0BcyV3Jvfc?_8;p4OC`R-Q~?J8IC)f^I`bp_=deUEaV*(ERWGyja(Win)Q z$Q}0sDrR$m=%aQx6IFv6EF@C9Yc4Vod$o0|6B=@~X1Gfu#7YLOhajOwsk_Pyu+eircEl! 
zP89_dxu5gG7O-{&FHIlZp6NDH(%;E?KlPw_wq|_HF-zsG|Inf?w(#)2{-sV5Yjlr^ zXty+JK*WW2VRs4OFAkkNlxg~&Qu}7U^Ui6d@-wp^>cW{xpHnRb>HAdjA-D%Rm-Qv{ z7`a?nhOmKj02YeWePZvGRdkp#R=A3=d3A=R}LNXB}J6$-QC zd)qNR4%_sN3}ypQYK528T~#SBLL2~Bo2C!x*xcJfWiC>mite;;Q{j`OaTzTnSny%{ zcBQ|5nBTu06n&t~;`Tu~!)5FPaq;AOMRR%BH*TI!Og4ITm~K7Ys#!K-vNn&L2o3)F zCmtuK&pNhfKFa;Yh7}1%U?cOt!YLqDsQeI;RH?Kz6@h&54T6pMr@``ty?5x~_4lun zX_xh1SD4oshn2I178~tDjGsExE}jhyeqX3(%P=7E=>qnea1*E7dylru$2SQ>$(aW6 zCx}QW$mhoKDhJlrwPwV+(1b_r)}vbS9_f$G>bgDTsaS0wQ&@0eihlDVjn_0klu4^F zXryU17B%zl_ljZ}i^QKRUA#cC`e5p2lFyMnzWX&r>)KjC$yhv+5WInr8`Rx>yvB3T zT{eHlMF}Cx8#g+CrC1`Q5;gpg#@B({?xm4+H7l$fWZTbV34P(U=CX@lZjZDmghbDBvCPTlct|(fF4051sS?7ao^FQi;^#%;-hT z-DySfw$pBhp3gPDK-(WKr=5YoaghV@Sn&;?=9)pmKit}N?kIg5FTH z_=DQg{Y{}#Glc0$94vfdpO|M_*L z`DhYQOaz|1)m`w8FUY&Xgo`tM850b#S>92O+8T%Y<#$aMw%$DV_R_IO=@ z#9cvd5z6oeR{53LCaE{fW+Dges5Ug;92e}7D^n18I+h;NL^@8$WBrM!C%E1*wwg_I zC)}{1cHA8g&YiE8QD|)J<`{Qk_Wf6VQoZ^f?&2rRV_!d~a%o;R$gAuv`B#AdL43Hc z4v@j2Z3QXUi@nSy1+iN9uvaC<+w$A=W|I2^gx+>PI1hJZ`!xccj5F&9*}0wazYBTD zS0Uk9TJ_P-Ql|p%q>~aD^?ol?25+p1iSBb!6_=}HU_ZUt?HUeCEkUcw8m|@;Znm3w z%ZODhmxc7!`sd}ZWlLqmT+HCKI7uoxV_v8E{YRv(3ubu#l)mUl6V4u$K^-z^ z{$;;K6FrpJXWgB37o97cn!?1=Eq~8>`mL#JEWMX}7*3n}2~SwopPg>f$1^=q#t*c#OP_jj~yHtnW?N%qye_ObZsL6!8S$kdXtRx#YL5j~1`AtmSj zO#&6upW5`A07I{v0SxvpZMB#DFvOFFw)@KT&W#Wc5C6~dHDmVNj9ACG>#*=+nDr#-}p3#`Sa2Mux9B67Rry21pJhxL7^VDE@HtZ_FGS~*oW}>X9 zX}(*YcOtt3EGW}7SRtUXIcl;OUI06=oj5bYywH(zm}=tFSZcd?ZKnWLt;s4`kYck6 z+=M-~P&`-vhkB6ILp8@QQ$H6G^S%x@PsqXv&oARkS(}6qzL|i-q91X&Cbi2EvzTz4 zUV2h&dro4&-|3f|)Zv*LW7Vm{{Y%*19oDWoz!Twkcp2EfG*_wF$kC=YP^adu{x4@n zO~_k?jXB<3mQEjGoe+SJo{d!XU)I3pUXcLO+MKZf%V&7R4N~rix+4~Ku`#CdCUBnwFFffgiM9 zZQ9SEv+13TnJ!&cL>{^ z`YRyE1;Eg{1=|5G{-3anJOIXUu`yg4G;&>(asej>DNHHL5Nx9(vhBe0k z__BWoC#e5Rl4O7+$Bt0Cboa_#M3uF>t7cmk;X_1*s~kP`s&c;ts3 zoPk+p`e_<48yi|q0A|T)6W;#;77xHK0u8)9>SI7uMDZd3f#qHkFiB&78@!s6h6fRWSy_@ZhD z#{t7j`YU_@ZDG5LEdVu^x+Mi*GX=Qz?Jw8xpacu{n?++UB(o0zb_n}Xz8iZdYdfF$Ppl|-b0 z$sPdxgskj)fHZ-FN0I=c$kNwgzzjAN5D38j$>=HrW*9?9kpBR~0`T2OW|ffpmn2?c zCRlJoUl+>akNJoI^y9=rXdm56JiM=BC?)H2)#<;_uXn!bxw!_b>gaVW@5=tdIt4QOZ$W?C)D{+hCl z7O5LwP*1sf&q#f~MSXE-kOZThsLWp-`?8P7mE@S}=m$YlM_y@v?mPw{z4OdR}9P_e5!C^ZIb_I%_f zhrz(BoH8PsbC=Ji6;Wq!r4kPR7Jyz12AKJ3ffSV=3l1Fq(Os(K zxrd{VEuPU{?SVzv{T^pcjx5)_Tyt;cL2Y!7yNt^TYg4T-%IJM($=0hd&GOG1wea>7 zX}52tU(WU-Inc8LPMs4)XL&~FXbY;j@79)@BitLhIG>>#N>?mNot{qF zE@<{5wz|CV=GhGeLhyRXHqN#uWiv$tgQKr8Rkm14_`SYQ^Opjo?HN%KG(N~wh-K6D zEW7r210=BbLl&~T;fZ&5A{e;1eXZ%{YavvRAH;WH`{@&9I<^9az}tD%j2PM9BnYKb z@RhlHC!&sC7Xylq7CZr7R-Cs19=*C%QW3=34T?hZAG4%}jWIA}jX4{Y%>%zUZvn?g zcA@oxuCH&h#6r*pGKl*P-nFa5)<~kxA{I9& zWVJ7+FSW{IT<$Z`qvcB5D`s`ZoWokVLXX3U)ARHGigX!Hzb?-{aciRI0Wy97W$`+C zxkExia$S!W=2~1>dS4a-$;u!;@4|O#m`lyjNHw#)Tjjdq$wKM(MqetH6zQ-7SDyDR z|7i7&>e}Ud>+c^}J;>lO645G!$j)957yp*P6f3~_LA0M00-1E#moxp{)KYvI9z<_93NT>8tx`8uood`D22rvH4cp zK;q*gp^T$6iN^=4xdc~#UDW1p|D(90a7Ox_u|GFPw78Tmju}^HgHwr zl6B8dS^wmBdA|9KvItEQvZbF$r&Pd$AuAR;>W(CkkA$54u{mf?5;(sn%^&v1oE-@d ze6ob87F3{;_?$`2^2C-KvsBdwu(a*w1;&7T_RSjl+3qko>%3WGg%rpqKmS_HIA9Gf zz>C3=c_UG>)MDcmrN)`0@tLg#j?j8eW3rSx;Cg8#1rGmrgNbVfIsSS(XP7sIG0Syz}pB}CVhI68u z#lGtdc(NVVrI?At%6aSGnnHh|-eP4f&(&mhLtlBoBj0ZKU-2rxMcMT?*68%l=zTSV?f{x%;@;$>QD}qOUM`Wfw)uk45r35zZ511DPX_6$! 
ze$RUZqsY5v5B#)vfYI7oqlf`s^9^9tt}_ zlh-@so{*Vs6VcWwjJ@APJ*ox42K7s3F=3FG5_j9*o3A~-*;Z!~iAKL4{pLA+CK=Q0 zV)-MhIpwTc3F1|$?{)Db;-RUJD_gNJA>tG4KeuWN3)a2asOWmXICoLcsTrwT&np{^Y461kiEi2mX9amh43m+LpT z>c_c4+`7|cK^&f!QxTQTDBNF#M98%T@X29AKAyDxP}A8u6`sv!?#+L=K$-0h zNPj|bd-jL*4ht>WN3KJ#0<3h`iN~(NfF!$L(gaUag+~;!)A(FokhXv3$zV)Une{qF zf^Kr-fhEKxICD3h@4JF5YaD-=PKU<2gAGLxv9kdRcCRB-gXGP%Eg1!Thx&XU*GxpQ zZQquvS@3L^{X{3^`(uuSw3_wW`)bkN@+#Kgu%A5V!YN-idl_Xk2UJSF4*`l-@t`{+ z#%WdR*Jq{YFthwk*1qno30*uey5pZ2(|#Zs2|BXC#i$T_B=WRBN{+nY!CyE|qJ0_5 z`f2;GdIg4vdk|NEQYGdz1|3CGrV>I;uEimkh4_|lN!4GP6lYX6sFam}gzo*Cl zxuITeF6M53pLOn^@0F1V9N9#Fl!~sYd37&5Wo?KH9Aqu==HF+^vx+#AFAqCwiSHb^ z{4z3F^sO8ZuWqG3kTzq?%*(bAcnAWoU2&PJmc2*9 zB(?piLwRnJ6F7pCsn|o~X(rE?$N`*&ZT;uhI+8{5FMu8*7xV^g<}VDq{Nk8DC^%$5 z|9c}50NHmw@a~MImj;eBpN~z@s-bhMyi6fOfAjl%?dw}|@tr+GnOsHNqu714p=Y(B z+3Ek~Dys+_G1Q6{OBYz6|27ejwXhECU6sTRCZ>1i}+7CScgbUF(0!3PbM5hBpL&yL*? z1Osl``)DV^2oOsx8;Nb&Y++?XKRwvIjgDNz=are){| zsG-YyT#Ol_oq)|_!dhAx9T-zISZ;2KwCy_Up+ z6Tf$1{)h!v=44$ULIBCy9A7G(+4*(0!EY6ku_g)Ray@r1XJO`|Dl%V{- zk0GoC{)MpSFu;Pl;?J}J2HdNPv3y{F2TR=nPo$|*?gM6+!EalEUk8^$i-C)OVZaQ- zX2b-rE#pH#fSzfE*7t1ahqyVik>WkzoT2Yl%whyc_77q!8bA~LW3JCCdSp(L)dO&E zC{zhN2L$xLF|~E>`kVe|Je>jHMsvUj_mq*H}G7tv(pqPdI@8Wy^E}mxOS@_R?F?;-r8A_yn+T{RDsW`kYtvPJ-rg>pm8v0RZI@F+s38KR zi!T>RWsXV(HEe8Hx7tl?L@EC`Vk1~Xrmr5Txv$xG_r{)o}NXxH~*`c=~HJpTYBXEe0Ea zU0COXC^og4aOzd=QEpqbG5bx?e5S_Nmz(ZWJvO4p5?(f|Dj0J;b^W*s1B!Ntd&IR0 z&}r!8ZT3me;W$-N3=`~RkpO7cH8m#uSF@tD*FC3gUk^T5$&9suX3%$_r?%f|=Z9c+ z%H@o@jX@h|o*VMLWy17>ke}~<;3xCkkvbxih{W_rXuW|+w1*G1v9bF>yp=Xk-2bsa z_Um}7Ymc?d0Wuz*W5q%lE+q>h5O7A;W~A*o0XF?;_eJ{_F?gQ6H(%k&crzFk&`$#+ zwMCzy6$EBxik?Iw;&cI9Y9Wuao+A?#t-$0i5!4(e*6haN?;){G@iTuo{MB1sYNc<^ zTo-6oC(@EjeTl+Vs(~k_zL5llpdXFdgysUFy ze#bayK6YRz5@Cq?f$ONHA@0v;x@m2b2gZ@r(7Kb6vuwg0dxe)?w3l8^TF6CzE4QXm zYT-#hH4PW|=%GL>*Km+rxjyxElJU}+D-T04+r1QJ@V86_zekXwXqu=W=zO=bp8|Yo zio!w%x{I%FH+4;h_1qW=82?09NCbzjit%iQhROy?bB&23%5|s3Z!fG;7RIzYSgG_{ zUCEvvy`pwo5f=9%AM-HYuXi`ca9hylDW`*(+o4)rJg z8ybz*pu3y6@FTH~;Z+w>{vmp>XUThT5|!vX4|(K#= zI3`yuelO(~U%lKLe_~o2wwUjb3S8Fc-)eWB>os2#w4xIm>R zGCo)x;rl@ZyN=uU8ib`E73YgmYBDqeyFRVp{+G5TJlD}2(ok=0YLMCS^GnTPo{0BK z1C>Bved|+~eg=1p4KfPy8r8H&JVD6B%1Y#CJ-I@@x0%p8qjR6|IJ^-CUY%cqCe6_G z?ZVR6U$zAfSwu+|ebx~#(y|{qeeZaWPj8Yhq@cT+$YMiTuTx);wAyT6z6?|}ZVQ0T z9rvhe3W`d!#YT94@ggTk78TpTS8D zbv9f_Tp#!>*xPO|CJ#;XLQSJNq~|xSO`@&b^4s;_xKWx#$B%0M4oO)8k~>9Qcd$F| zOe3sdhSZziuSsj4s2^S$+mHcG$o0z3!RCr0UL(7Jy7_C@_}AYwv2bp4V_+FW_HN_H zqMYUz5g77tG*ql4nd7BCMoDw^NOr3-gEj}`gIT}wg?hmMC072)Hip^7(`7vtZ@tO! 
z9A7GZlg|@E7L;~D%H>3Kw#Bw(*{PsQ`(}&2;o%SMSO$N%^L{O3*ZEAjLX+=99FUBv z?u`TI-mP)U8bK0h4jLhp{o3w#+*L#(P0gtB@_7ObY27pBL)RcYYMDIBt#$lM|UtF2XbUgoTy)5bDuJXdEl#OTWy z2~dO_9xy1qqDm8R!7~Irgi*H^V`;>;i&J93*u^>z5d)NA!^w6VbJb5(KdniUHbKhdhHeePUI$EvBxW4duwg#WzBu-ArrIDy;s@3$&~ zbZef&d59OUj(W~>Wr|~Gor0cc^N}foCi9~3S$rJ6B|f(O>10tEQY2(toGFRKF18(a zd?xrc*>0t~b9dZyQ59_7Z?hd5%Xsz-be?jdoLB7-6Pd6&ZWuNEE~naY8#FxgQhj!e|rBCGfX2HNm|sj@S9Un~eb{pMuvJAxU$5DiWwt zAz=yjhS$FK`Bk)=!s{}7>UVnXx(L-{l)>RC{}kz}fmV`jb13CHgOi4X5M_wU{kYz( zE#UC-XI{=I@#4@ujsK$*wS9ke?vM!jEqc+y(M4ZGlX6$NQ*5_;5^wWs^lww<`AWcx zPoJ4Rh(%(u)opJJi->bfN`XCz_>zm-HVXHn4mRmbmy{gWaChX9X3}pG6n*!E?q_w$ zl2L8us={M6Wz26>1HE8HygJTHl1ngB4N)0>`^|Ybpf~Pr4$#3hTL+i z)_czFugC>^WdsTJ&fkLDEYoixXn*o{?jYW`#+PKe?+{#9u}>X2_Sz& zE4ng?wy$|`{f2VY|0EJ6PW0UP_fP7QavKr`Ti*FuW0o;;;fNQj6*2=yADrRyK@XW* zwM4iQ1L`nQZCS*5EVVTp$zz#@Tw?<@30FiXO#Udei`u)kg~G(K+GyzcDqgp7oE@Vl^_&^G!t9E= zmAotmKwnubIPZx0BVbR$c}x=%m&EDO1AySvb8n%1VW1XXkXf@V0vmK*h4MZ&GFS3m zX|p?Ma&PgD=@D@q5}mY%0l}NYN_3pR7O|(%oP4&$s8`I1_QaWzZgP6O z$%=8B(gX@8v#>W>?ykRv?q`zr6fXn~6C*wF!57hYnVcvWJqfATsWCbUqM$P4#+%)d zLkH0Nlg-oK_#21lr#Nc`Uuy1`rJ4_6Gqx)-jTIa@z7InKsqY9-J)&Go_BlHEye{#X zHGdUL4u=zA84u#x1m;(V*~CJ;<6-0Jtx-V{zCBl~_NtTX!-*{N5U^A?iJ%*0Uyc;n z&7Ewd3DC4nYiwjF;C!!|Te}8AfI}e=u~4s&Yn`0@p}A&@AWZ#Hw~?+eW?#be^uW+B zdm24{;?IqAkzq6V%mD&^K}Nj6PZL=OJ-k-W*>g*tlpi8t4W;{OY`Uod6dSkVy$XMt z`=EOv2(q8|*PxZj)>EyCvtm4ae$PaSKHgP$=fox z+3B4}-}92)R?f5Ei977~(%lPrp&lLSC@u-fxuk7h_Ysgd z0)4+0Ha{$Ea^7E^cr0wbX!1J|Se4fCZQg(R_*jnA!xpq{>2Q}rEyhR{+VSQ$t zP5lEm6O;2Wf9$uPrrsAu3-+0iKL>5y-JBEgdF{bW(&z)+2N&D4lpmnaEiTbFiH~}O zeHmJ3Pr5`qA0zMc&rL7Baml9hy41L&QH96Ib=3$8jw(Fe6QHE>_D$x?(~^lQD5bKd z;@a-s`aO!(h7nA8;msAe9(QMJEE10)y{R57WK0<~xWN+q(KUCm@(hv?#r+PXy-?>T z%1bd9-9~uw{0A8wR&-c7$;n7`GFwk}t;L5zXDjT_GvuiIwwC*r7d%2C$?$6Ii$QwWJ1V{}O5|@w z*kJo#;nQQ+Lqv-D4TR{|?Ne;}Xeg;~C8#MI6HX;wg4L+$3jTa6DEQ|c#T+&rbMpIJ zy?tAQWXM)O+@$s0QP;IrKa=_R4}z+L3%lQPcxKW3b5x!IDAx=e#(le;4!<|w(A%X2 zI_oehI}l_aO^Gt#wM92<;MhuOzqzzq3hC53$x#!dgz3Ih#&jB_s}Tl1a-@Dg?+E_p z;Etm4(rv~$lAbKrkeC`$mZpTsST}-}H^CwRN||*(u6!llSGC3q@>6r9K#9CdEE-Aa+gfp4eoO?i z&Z~V(1v2>Mfy(2U;VTQ%rR(BC&RX3ozJ?7RzbnHZnuj7D$YlMUA!RleK*eFhMsxsr zweg9;R!%wN9)x8KJYGkYY%03=RL+N37oSbYB{-7YVASL%3Jr*;Q<5Alf16Mfma ze3B&lCBY#qpQ<|Y@*=3(bP9S;bH`NMauwRcr%hU^7uL!DrO|Q89WtCS*mbQEo^o(? zNWD2yLS@p}Z?UgEdCa@jQnUygq*`)p#!WIwK0yUmjIQysZ~vf+Lg z{=?K-e9J#rk8c`dm?+_;od!Xx9k(wYcCI9LY;@Gk?5O$@o?nfP3R&V8U^I11$zVE~N6MUgd@;qWf*=n|Qr*s|wV2R7XS{7Y zV>Z}kEa}Wkjx-~;u&Cf7V+a|`*mh23PMI!GE$^XtYx^_l$5tocS_N6&_t7=RfG%FY zG}VoidakT;DAZot10A=EM|t_h=i>MAm7=Ii*WXCo)&8=Tt=R_&-LLXgU$YWqPlQnxAA#**<3!MGG<=l@Fj>+Hq?t&7Xj2 z7d8jsr;kT>Kd!8lJ9jScTgP+zVU(xbkG5ofCz-L!?vRlTRZN078OdQEvP-7%GCdEd z@lD~dqw}Kb+epW$UlY-7F@xwNw*AqYgwlQ^dp}Vf3*0Atk4y^7oDyU_`q5tewi-7W zT6r!y4Ybl6Ja9XFKdY0;l$tIf;BtTe3O!vwF%=Pr`y=;sxJWSz-rcX(j&E=q+eG6rc;}MZzup|n#N3=1VEV(*XpNnFHZxT`v{~r2 zi72A|lTEN$KQd|ZQ*yifZJ89-1s>w(T0{8XZ83oI*?Rls<}{njT^U&q^1RU1VT1VL zVXPlcu0>Er_K80*rg8L{7%|{+E@D{Lo>a%TNKbC3B82YBQc#}P0`1-t^LE-a~ZjN1{JL>=ePDV zGMOB=h;esXP^Tb@iD~V2WX4gw#j@SujV2Z{*{98z^orio=eoW5r@Q8>c%! 
zhyY`WwXKPsP~w`-;GxfVGETRrdPXhI6bE?t%tu9poC1}P(*2;x)8XHE0}LyP_ZfCI zxaXo^9MjuljhtpSJXs zt9?KkrR{k_U+kl*p+XP%F~Q}1t(4#U58nX=4_kzesJZIR_AZK;aA)N4Mo z{1^|r2}nj$9jwrISig07E4)*f-D4x3&w~u#MH;peGU4+zmY^Y%G3raLwTY#fctN-e zO{pJ;{wm%!?^0|^+ad(_MXf|PMLR0>=Zqb=sFr$^^G(uA4J_UDQ(_m{VbT7+2NEL| z3zsP_XM7npg52(IZLzI62_HzVWG?Ud69(uhFSN!CR2^4p%!b2X48L0o;H47c6)y;I ztgS9uZsGmhSVq9H>7#p<<2U{I+@dGy#e@$%D)S?Bg>aJ$a)0>YLU#_A|M`wOj4o1< zq1|IY_$+WViHQu+Ko%kDd%b!35}bQz$^9H&Ib9SsJ7*j*>5%iVpp2Wr{9xTAXI zhMs~&Zm;kz$}9;Y4(cmIR-kvi!+cws7{MZf&z5&D-*;@>ki+a_s{{3_xihXi!_q!= zoC2i1(IsjOLW1sOId^zlj*I2;@qTGA8Lu{FTpdPS{}FIGr8}}sbH0DEQt)BM=$g}e zC7(rpYrN7oy?;Dif*)t&dNOG$uW`4gE8wW}lUb#NC6xL6np!#4H{8~=nuz}p2YrHR zba7Ld0jMef&V*ayXNq|+=eo9j3E{ZBcFJAcWEbiB4P-w&MI&`Y=L?L)qlXQ z6C_U@bxVAsK=gij$K_@4Z_>usk=V521(4?(Jb&z04U0A`cz?~wdbloNsYNCh@pO9E zds%c-5p1GpF|%EH!-hn zC>0sB6R!B|@=uj&8Qgj-9h#W`(>Er%uOXz`H+nd^s{y0BLQ5yzPv4NG-5*_!QyWOI zVlyS-K*Xnw{ftYmM?WRSk`SlIyLM3jg96lGk-Kj2I9fdmbU?~3IdnKN>u0ATLt7NR zehKGilZiCPvMKN}+rqf7vrP?0VxXN?QDu#sWO_$X4Q%pxa3h^TCdknDj~MN>W?6+d zYdeg36|&fZ%p_Lo1L{^Mcc#6=v_HIMp@fwmyC;gF4gH<$p8^KUSNv=O6$MRoxEl7q z6fl0wwrqn%V!&Myw1yPLL(Ho6c>&erwa}!LG+JF)FL5wxY>+RdN3`O8=!92Z55pwq z+BAL9Yia=#uR<~+@Zc#@P@R@cq@-w)MkEDab*s!gb;Vof&{ujT%DSyzC&wsg=xopQ zXIk?G-9FN!I?|=h1ZJdWev@u|rhHhkZ57Sr&p(MI(%NRr^BU-*KyN@&oap z^Bh6qk6uwB3&5$X3JZy;4wNfq=H6;s$M!V30(V1Tu z#*2n_Y3o6lRreD1x52dUNkyqCYc=OhGgPqPXJ+tLAh@rW!6gq^LfmO98VbHCp*3a) z**49;z&+duX+bl+0c_;L19p>J7WWQS3(s;SLvImMQEyQ^v!R*osin+4qOmUv7j;Unf7eAfVqfMxPK&!eP)qS2g>i{y@Qo z>KV}Ww_3Z(C?H4JSyll?J|A40zOQ`vlIfw!4d_>fO$r0@L)VEAB)7MvB7_ojW5rUq zUUhzg4zEJw6q0j@1)JXjH)nZ$DXM6yK9I0az4~;S`k7U>P2t-=L{AngEiTvb zlNZdK9p>oD6Hs?(4HB*R?l~ctE+d7t=ru))IDRI*s|3PV*!*vv33j+#QfwMfpmwCk z?}xe?X$V6IlhImq>2V2|I8|3*!0}N$JjeHouDwy!y(>PZ)$~$|+5tk<;kjO7a&K}) z5?lJx)bKp>oE{bl)W*3>SMD=D>3OJc_NYUTye8O3N)ov7# zPTK0Y?H|MRqCq6nH~jXZ%{#-fFE86QmEs%@OJ`d7GVb)LnU!$MGG&53b8Gi4+KXO( zgOFaY)PAPveTYE5Mr0$P5PEbdv^6?HSRJ!9Mw;D8$<786LK0wiV`*~Z!#^!04O|hK z<)6ZG37HQP*+O;*#mMgmt`k_>BoG#)lSg#E9ENd@XgVW<%m^V1z}PcksnBl;?$dKW zx*3ZkMcWVK3!ntB2hTlOt*WSo*V<+H7R&oLY&hbBi8hB-VS>U`_C-v_HtK*5%zZZw zl(y4-$Uo)A1pAi#_2{yY!>(ga$qWlg*5b+M!t9L(i(;5;w~(+Us56c^xQ{F5o?D@e zL-hJ-FGkAL3V1&KR9LD$y+3O$9~Fy@C)|AHxNy_9;*(`^ifGV@Qlvbt8qHi1-_fX; zc|-y=mTO@3o$p2ur;uY~!68BpGZyYGmRxPjFe^2Qx$zqqj4m*-*VngPS`lF}(0~IP zW)0j8a;#k84Z$iUJ!b(bLPEx^gyw;)AzZ>*5qV{5wF-VPIIT44DHs`b2<&ULq|qY< z;rSIi$jco?wHnQ-%)oI8A$l%+#c6UqJXa5;D^d3dig@PJ&wY!q@*Cve8QTxp9u|>2 z-I(Tt%&8E5`PnaLLA(iQfkdXtZYvLqc7%*-@B0)Y+V`SZ#0z+2m5)WUHIcl}K%VHm zwZgvQ1tU&TTmOTtuMUek_}*1X0qIy;y1QE%B&EA{0V!nx=}<~yVd)NOkdp3(B?SSc zq?Zt+ySX2~-`~CWk9+@NpJ$kv&&-)RbLM@|vC(ZK?-P>g2(mr@IakAGemr!B+}&Vt zy}zN>t2D{7MO$^}7UsJSDH_3cpZhWU18mT8p%7#B2>f(cwfvo|(Z8dnB- zYz-;Tu~%I72C1dPvjJ10s*TY@kHhzMt}+95xC^#90l!1yEU)ut&fQbS4xc?weVg#5 z{4eRrYmz#=F#)G}0Ty?+hu+aNd4$z&yN}C=%omisG8aN~0%^x1rz}m%TOatB>0vuu z>Cze$7-WO{D%3qlRZXr`oacPSvnBlv(ePuhH?X3I?30E4S8dkMZ+NoW4Yw^WolRF${lS%mjCzEo z1UF)QZQcT?a;mz6(wFSv)58(q3Yf2Y1(2iLNboDX0~u}EQE&>x?@|=lbY|s;MdM&D zISDnx&Ge9k(8*f}Re$$Uh0>pFAL_p%d-1v0yE9x9Vg6Gm?%N}bXer!FuO`0Zc~h@} zM8Ak3U_RMh9MX%V@BZmeOHn^qv0q%(Op@A1J3<)A83fo1JjNiczW9nGpiB_zwcMt@ zaHrBYS<(9v+F$k5#xL&gMV1wQT@@e9xusniikF??2e3QVz@8H^TXjD%T~s*L%vbxj zIlNV4jZ&#HC=CC$e;o@TR?!{#w5T+~ z?XHH1-)Ev%6y~YtTw1+kfrWFcbZg71WT)qQe06M3W8b4g9p#ef@Tj=xpBnv>^WKR= zN|)WnWH5GUAnqj;y?Jle2EiebGt3S-$%x?<=7iKr8j^dh+o-8EK17CZa#$@$9L}K= zTxXj$9=B;|!qr>gh0bBX?vhlpS0l42OtPRX9oZrU99D}RXXNzF3EpxaayV$xb#izw z&M(7)qTcdpNH@Ljy{aMP_d!nOBrDnw)(wV-si&~)yEr&WL_0*im%O+VcJWHx$+bF8 z;d=XO;y`OTV75^uF1ayyHok%;T?y1i4{04TvXS$;-XP*!XjlENeYKJ8bsWk`2w?(S 
z6p-@&(tZDbVFWx7gk;d3mkm#hFGPI1h^wc;$&lB>l^PY}u%yHU11r6be0o?c*~N0) znvDPc?isK9%p1sPN@`4Iv%~i*#(98s%e|maClI+l=kL8(oWi205ZlHKL2ykndk(3j z!Qm0!<9IA&crAy~KnMw|n==x#dUhe?IBs{{gn^AXwgeTYl-NI%tW7{KZS7jwW=$>P z(m+yx!2z`jn&fvI&yd#kj?2w;qduVH<}1M(OL*Td;E=PA_ zd1c%pg-Rzd{5xK}w^Z4VFl@kTj4g@M>oMe-QuvDv9lxE~Zb?GOKa*Qxw_G(pIQpJZ zKlc9L2Nn*c@(c~b#zIGGf*ydSbK$3Tl-$@$LMPDU=G{GSkVR%%wZ>+@^hv2Sr~g7!hsU^!+(N{^W+^?m^4OJvQg zorD?q4|(TSwZ1>`<~M-6v%VW~Q+Jt6&FKE15VThoM{cgOD1{&ur+^`N;9Gyi+K-i_ z+GWZZ&@>QObytD3wsh_^lsL1%Jv~A2<;9_HPWpZGsvHVpTb=HyH(1LBemmg(5b2@Q zpj^C7mB=cKov!-iN+vRZ}{ScN4cR)b!0(hv}oK$(3<)$2M>#obdNl*F_pj>^}C7le}1%aPg$eLL~7+UPsLkj#Urzo0F72k$6&aqa^4%Q&tHUqZ9w+UuMZ zJH329GI57Le$9)(|C%t?6>W)Eb3YD^fI`0_qT+14z8(1w`%Syx4S>JxB(>G={;^p9 zp{K3=6KazElKy`3i@*Glf-ODvcY5~3!Tz~f>^`_IR6T`xn#%BSY1AZUEdo@sk4h#X zI`j*w>)sPe#%>@NfXiEy^pu%nU<~LjtWOk&r8#Y?$BN&sv{)&beQkcj3XqmUkd7S{ z&{c9`A|Cl0{X8!@zWvIwxpr@X+PaSBcV}r=WD%aDDYNzg(#s3jD-1L;}R(0TJVoDr7 z3-E1D1A{LT0JJ!ZbeGRt&q)iz#!x5I+<>FE*h;-VPT71&Pvkw^ehhb6KPug_`kJJG zj4Rmt*^9ReNgQ1UVOZDJzdYKM&4yq-Y3IxOzHbBTJY#IkKIXha$q*diFf*UcwJETj zR%(b#|8uwq3jwUfAvWu_lDl%nt$uA%#g)xUrhGw5;xLmdt{kh1^@sXFjjsmsACnu` z3CQ@7;P#I+r@cm`1Ak`=pJ!l!j{_UNjw2NT&iwI8tAsKD3jfF2qydiZz}eXf-IsvM{vZ{%f!rK@ub8lPY>AEE8MN9wa28jx^Zz+U%Fo2j{LSZ7 z`iz#BY-g&d|IZtjH$_nuZaMzb+qA&l_1dXWHXbcVJn)XM)8bL$1Tqpj(e^>vJT~H{ zmcV`V+TT%+dC8LGaWS*+1H_KmhWma^PHGu0^F^5ReAdhJcQ zzC}84m*KfU`i~wTs8L(%z{W&3PXnu6f1NQS5!RI>F$1K-zsgKNx)nX#5Kj( z127nNL@}nzU;t>mBJ47tzYs-AwA^=yb4wR(GrqHoVhfs>W zCF#AwzK|a|`NfZr1o#};U8wFCfT0X`&|A$%FI1fzwbgx_G#uw^7!iwSfzo#akfah<45_yA=u@lzQQ`uzjW5wBMWF#IS>V=O^8)eu1Zq~00=}6IzIU49|3SVvE|PgrjF8R^CL?v zt==QDSFplX4dz{$yc+}B+|m)Zm|$p{gFi)tY>yB0Gs&>($(z=94;fTeg2(EB=i8!^ zDQz2oR^Pi}k*^H+2{(PoA&f_%O$PYuk`0TRc+BlM#dctiLSBh*{oqSX<0&Y^dvNGT zzCZ3s_2(e;{q{L3pI&3?oKQ-EP+@xN^eLwm0-zwyh(mCIGfb9d*wrv8&K$eX6Nzx=+I!t^Zm zcvUOjG4=H>&!?G_;#xZWbk#V(z>$L)*7@apnZsFAGq?|m>e&|K-Jkg>^C;TF#@+pN zXlO19LZb!YgT)OGPjBpTF2{xyRck$$n&sTtnJmxB`f;tK3_bJgSoEgP+z~k3-La};(l-O=;XBDAvRpMnOwX>n{Iyu@RC@@TyQc@Ex#KZmJ}?7;>E zA}65Ij>tjA$javogLrFQZcKTsj<{RkOtZ#gVW7H&7zxNoj>f@yb z3odc4f~~&9?_!0RRECcpS6@Qasb3;?dYk3!`0eaEq2OFHbrz8||7&ZAy{<}hVWPX& z>V##wn{@2d6n&byV6f(DW_mb2KlzhD_YcUCdm2ex^rASQ?J?sWG*S{yR(4!|+u}k* zkkY1G zX+ncvX=~$)d;Ix`hE4kN4Lt^@(HSN#ZX|(agsIV!<>l%ht+u|XXlUI7MUvt0#iwd& ze_5+nGK??w%_D~sGw7TeNL#6Uf_t=T?Puyn zuI1RW%958pHyfY^KVtsv@!*#a>Fm!HIsXXIQkJ|6ZD?TfJa8R^%LI=04dNkQBHpju z&}%Ggc|S@&0V2_S9Mtrhn^Rbdm9e%9yS=ouI+me%LjKKR`t?u>mr&3Q;uk4^R_j~q zOXc-;1xxqwwSCTwCBJU#4A=%wZJe`DHycz=Hp!yA5?akHvgYecgq@5~%>ITALXxG1elU%;} zC*`eZX}s$7>&r7|s}OAEaqz31I6w1BT_KO$d6cJj0>aN}56yNyMQSj2izN#v9g@d5 zU)N=x-BehG{myCnQSQofQ7K#cyrw$G}&aOn3^ z^@x5AOarZuz>E*VQd4`q`@Zy)zR6cWg_XRU`zU?7U3OCr;DR-(`qDH1v+IVy|^%HaU#o5&-xPUkg`gQlY5>UW;}Gqc}LHc1<*7Ic!X51CI$zw zxiTW3kbewD3+l(b%%<`PV^MPHkkvKV^*P_=Q^F|^PvxQQ4mhLP7}4il1expx9j`wP z{HqVk*P+LuZX}tnC5uRy6&j2fc9O&AFG~EyNdEK6_cq`7Yw_zLkLyXr7Zscz#B<^K zIf@B_A4Xof{7%?UTXZATfv9|%^b1lh&Pwk^$A9ySXqYblUbh#YU&b1zXKogcO-J~R zoZa;a+Bd)1w^%2was_lSuH&s3+J^@}?03t)uoMUc;BOF}4~~{8VS@ycwc=0bx!I2G{-UI9`9YlGacvZ6Mg892Ups;R zH%F%dyB;P(hM-~<4!Jax|I~k@>mo>fpjXPzO1>&IW-Z@5*J^Kh ziuyKXPuJ#{m3x)R^R;trpw>8(EM~g6*_tX{oz(4U;+YL;Zbh_AJfX$tjFB-m#0zPgL|bY&k4sp_(j zZyH09d-BJI7(4g7oE_)KQe9w~{d~$OECYx|KvXnh!sVL@9WKsU$&V|{r8d$pDny}K zV$n7cznMivnKyes8<_+^q&Xh2d7K@2Yrb7YRGQk^49kRQS6c)Lz7>6C>^_n&ef^45 z>l?HWhtuk_apgiafS?rYs>4IR(2%OR&cF{HTm53?k3+i0iT{T-A{B0#@ASYulE9!G zPX%hVWu;0KXN!I_$-`AK$7Sk>?Y>4puyzEtiomqL9)I@xVLsM)=}~Fwr|Md(1Ew3v zNEkfjE<4TCWT(q;&FQXP>$&U@4n3kL4|Gd}WYMyqvwW~*E5yimUE6a`?ThD3C%Tp5 z__bUSQB!0A<9eIC2lqMk0<3~uDv8X`!$t&&e>W(q_e9GvANIAlH}N{yNc^lY4qxeb 
zlzdshO1&ayhffP>v2z%FFw+l2B#+%uz4BjWtkQ0Do*PLXVbs+8wi&`*IURM)*?>Wv z@fW!Y2K1$=k(sL&xX4}2iM@bvUolB)>fR3X(`X_s}cQaOA2saL6(w*6a2@%TuRv{1pR3UBw)T(=PZw$frec8>OK3V_R2;d= zIF1X#dk`&`At$F)+AXy8484B!x?EXHG#)NwAl^INyV9zvFk-EiV%2vH1}(LteVx|P zPJT|Z$!RngRR85YzqMqQOdw7Tf|yP~{;M98rS>@Oc@MoV84|!zkIuwQDVlk^q=VQ5@gGrb3l)89WTnkG3s+{!F5OYE>odJ>5xomybX zIT#g!u{Mr&XR;DB(ZbL2BZH5BCAZ6mNMo2J^h_+oxuK| z=p6)X_EaZ6I-b!JN{?qSYb;uimGfe!*UZ@TvGZf6TWj|}WL}Z-UsoW{%9cFxI`Yvm zJEg0SH;!NIEwv@hycpjN9!-89u2)vuokFTJWlYRw@&cgn{c@`0K_W%o=h*bMsy2J< zy!>i#7)-CoJ~<#s%_U`tT=<1FYH;M#sp~}V!co}b^f{j)%2XKysMGRfvD1FztII`x zdl8&F1_#kBBDCN`89(boXDvC@5h0$>KZ!>P=YM}yn=llot!>-DH2wY)A&;*3-lDyy zCmG}C{ZYV9^eM(T@@Dui`|rySUCOb&Vq(^fulQRt$f}LA*0Y817^btERgZZfCSW|i zPU5IzCma7n&|@70kC25l?~Wlu)@N!wq9u%T{_VvUxBEJvzGqC+%opEhsL-Mq;2Oj%`NT?+>AWBy}qezQuIq-OMzLQM(sU z)(#dONQA=>mt40vG?DtZ5R@aEg~!YdA8bBL6U?3>bINKb?R!qB=)NhpJZ2mxtV%d{ zEQ-fwQ-yqTFi~l$9Y=3xZ&LWPVDvL)Ayil;G1HcmAKW&)TglTIrcRKDTOs{XWWs_Y zZ|ZM4*}JTkNC64WzVRO1*kP(;`Fq+pNw8Z(GbC{coyjRxAt6b%kMlv0j5#1b6l=c8 zxtcH{6fTA5vp!P8L4}XP|D4QA={a?RpoR|dcIafdizN5;KgGZe7h@I(g9Yf+o!2WG zSr4;M5RJmDvK0xtJx1S14Tt<@57x%U1+aW`S(fMOoblQ%{p1g{uI)DCnN3YhUIZ$P z(~BjywJ?#HA+hXRI=#KPSZJ%HJhWSAM;fFrFM8)Qo4(zTIf%Npr~I6ZuW~Dj=QHNF zU9unj!QP;jr11LrL>>n$KJoBJHuNBtI;stOb0|`>o>Vb)hxTi zzs6go2-C+TMgdLyZjt{04s)x`p4t_PsjDModx!t%Ksmz!6UxD20&WqeFrtrfY!g#? zA3h&NVc*qauAMJpH;pg%mbsKzZNJ6vi(MPJ(_s#UyHr?lnY}4TO(@bN{RA%}cpZTE zPP%(%zeGRt%x5>IWAw83ZxLC=H4dg>;deT-ryEzo7$-?11qUu>P(?uNL`&?W#6@oQ zky(RQq`fFvnsR=^`JjYQ0pr~bbiP23zjT@>G^!^7yCG5>7u|ougb_PKOtkS!s2~@wIKpYNCdjWAWuL$^x(q9SykhpBDi0^R)sOD zF?wp|DX3C9na<5K84&SZKiuk^#{=)jAqT4)My+Wwy~|Q+$9LBOkYSxC7Z;u|C~@d# zyeb{k8U&mqgHB$_q^y4pWEA3Y2e9o#Yd0|uvbrjnTc18W_1-?!?9j^UHP$G$b!7?> zO5clW=x<8KVgM+SzQ|ugUw{ZhhDT{?sX-4NmqBPp8lC=PFr3M+zr68xjUPuB%z&72SqCuTbY$jH4)RE zeoOy@h`}b{FRJ>(Xm5|!47n~K$C z^h5|m!TEU;KZRL8X@xz7zj?WJ|K4^U zJ<0K>BO)}FFimnY&^#>E6g6TVdK_P%(V1CMwnr>d<@}8FcWXrYZ&9e<(K?fkc7IZ*RK#OWM?gUZDz@!#RO`k@ zZ$_Nx-z^c?#sL$F=ofWi1g*RnxUyhHOjSpggjo5X zuQ4=u8^UK~!&)#)B`MYk!ON#tjYCQH4wo`*M|TgWT2v!^E|?gJaNE1$fx%i=I>`)r z^9Z!Y19xsS==KOTmARL~ec^YKPBV|%#uw!8N8v4M8?rr7XdlWWeoX+UHnF-V*Kzl$ zpE!s}4I|citJ;>kXy6 zs4$QJR3bODaRr?}SZcn#GsZOJE&L3NCo=xsosKm1>D-sQo{MsKFg9}3DdwDt)Nebm z>H5-Ve@cTCeOH9fUfnxLQRRJbPHIRasaJ#i8xJ1jb{T1)DJJ&+d$+v z%sN1yUF*>Q$tVB<(j8m$Tt__w^qry!KY}P|VCxw+EjD6(5`Xp-zb=lv{O1>T0YHm@ zI~c_r1Jr+koxiu}w9B_)i_U>`Hbu?xrt$tLDAKO&l>M@#XcXk?$6m^VkZ%n|ly4d7 z8L!C#i51?jNMl&$HhEu;N86MVp?&WkN)qES_n-blt^D~ZQS|$D0PBl8a~V^e@pHuX z`E^uzSVW4!-q14{)Y9RqW<@7nQZ)_ZNMXVG7g(6VB8HXv0qc#j-L`I2W8>PwG?*!> zSmlYyPJ_g;yJ_t1RQF_$a?D_V0%D}lGg=&R!B4Zxr_t|F6V|EB(T@$cCv16K;QS=CW`j-@Kd;-2ie~yS- zr>V+Aulbpj{5AyAb3z|s;dg~U!4^8AL=?D)sWQT7XPgQ+-AI{HfJ(obUKFj1E0d+y zv_t~ki$3*PtF$2=_1nB*b}<%BcM*G;l*qIZC1|&1SVD|*FYEL&Ky{J6Bd0@O#$gw0 z4QX`({X$A>nh4Kc&w?w0BHq*Kc1LIc47vcV_&NB@MFQ$}-1DXv!60eIh|5<*8NIQI zPzn3Bi)OW8XghX#dplXZb83t$qiPqk8Jj=nzL8wu+r^^|uWK$b`JkagbfVv_{q}7a zt4}TJ-ldPBX~VnBZx)&>4zq{33QlLUmu2!sj%gT6*eQ%E+qg3re%z^h2F!Npm&gT159BES_Q8~ z;T=l9QczP}YoUk0oL^U|U;`6UYDIosAVU{xS5>PTnj440!nc6X?Qe=Km zbOSL2MH$lRjtg-Q$?38T%T?$Uat()?qWdtjVAsVtK%!yB7ojg9N~tnJh9DBSQd4K? 
z?j+Try9E(&r!rR;oTZ?4&NsF9*qx#hVXg*DE-(R2Qf_A6j4(FidYz8^BmN)()z>fO ziide);ekA#Fw-Xks)Y8r#V5Qx{=ziPGmwZ4p?|8}^5lTLd(31HQ|kw%67KI%@Co}i z!6DF%=A!~WHgPiwlM+~r(Brx-Z0t0`i%$33eF9&~XaT`zpIJ9BO@8TVM%Y*>UlcSJ zXf5Ju<53F>unC+#qJ9PC+5xsMu>Z}Y2pZCS=&nnaJ)Ff6&{Z+BpaX=#aSn^*k3pN7 zta4kAWR8ez7r^)vBeDNjy$E#X?osZ0bDajzd??HPW3ogZX8NOLDY;kUbEP4{gZjrV zHx^>xBiKR(6EmC;iWkO7Xk&7AN%#YRWA7oDnCB#y1K^x2T5s$$J!~uuY_F~g$P~wf zZ^#Mw{*#Pj#WA_!<1}*JUZ8M@!Pp7VhqN_2JvJa0u^HcpGmr~4lP?U&#rJlAvl^(A z{GC!Va1m1KA(CKVY0TKxhI~PUd;QprWoVPz5s{62z?rZ3qKy+n#(M zq6Y{mD%~~-OF&^%p@Px0F!chu=-Q7!O9=DR?18Hp-ynA@6HwSAKFUWLsMCE9#s+jc zMwSEJ2S~g@YossntPwb>?P9{4h`Sl=cf18ZmLTeAQ5%J|cNS}-wgManNj+51)>_$@%w9TWJk zoE84b8P$ky=-)U$$%y|a=lG8UjBP-v=Z=VEKJ0YF8cE0^AVKsm*=zZJHzbP1dUA4iy5HEx-GEgXIr-#>`6$HTxfNU~wkfE5_R06_)39&Mi4?3b0_yI9n4q!Egc&8BE_!jZWB6><~%U9m--RK1PrGKG!<2NZ8ENheR z`y!DY|IODKDDSQKf6^j>vN<32M@7ySmH$uT|1V=!-LMxIe9ve0BmTTQt?~8Hh{LY% zImg88Mq7$LB4LmzwXNqqA3PinksKtdO?BL8!C78CMs8slPOc9$FC;S&>Ms$r`c zxjrdS?djh1T&DVNN|CA=_!!dLXS3cP}GaVH7EGU%C9;AyP%{^3h08Y$nhs@ah!7vpk`VF9Ln zPPG(0=fuV_qBm*&7ciIkQ9OezR)>V7y2eI1^mS)2xUwm$cr6Hq0o#<+V1^KCbzSyf zG4XBraWpE&cyPXD?_71y6|;TKSj&X)MMs_JXg`{PgFdGuyUA`~JK5yW`MAw#3;s0J zY+G_#F>HAd)dIvdPM3m`_Iij|)Bo_2hKIIOZ+|wyEYHExjizSHl$S+o6^{)pN6Z7}+ zGSiwK(TPMrnQffkGa~vWeGm~Kr!P?x(i4RMJLQV{`f9D*aCb^5i=DRqT9n~>Zutpq z*Gt;2$-%)5`+-5{!L=(R#7lMAPjvfp3ENg>YtB;s)%=4OLG6$KeP5Cy=sq3FCh-;KY412j%!-)hvIG<@@^pZls)FZU6LRzWY+_E&>DS|@>vfkX=Z8UouM&`fr=LE{@GZ<_}2W~p@r=jIJhuLrcB`ad zaCf#=#?_kOtIfbr3UNY#9Q2B>6Iy=Q-C)zjPPgd4Dtpe0OJ42}gVz9^JraPX5YLJN z5mwOLD;|LT7ECgJl$;I>a*qG$mYAgHct~z8S$l`Yy8K6(vfLml`T5@xedFTm$ku z>1gzm2L~3mpz9$fDq}CyHVXEdz$DipcV&mM11@A3CWKyg^_ zPPpBQbN^jmc)?uLN@8zk6FZAHqiu^i)vG2KU2--Z#z{7ohB~^ss?pR#i~`NDt>jUU9n)F&govFou*vaAUs}s4Nya3PDz1XEph{G~Y6ax#{l+95{Oh2*2Jm}&YFhVlzz4g~#Ou8}XU`-_pH zU5AqGmcF0wH`0C*u!%ZUEmmPA#Bm5l>$pj-Jpcxd)nAE*bkA7+s^m&h>)30cm`V!} z#1a@d?RD@K#j(xa{Xxj@nKW~tf-n4n8yI(Pvl1l2tQO1Ir|eWqE^oA)#Xp_;O(^!K zCED#=CKRhvgx7pct>79=e%SB%h2gDA(D16|*=|n8*9n2LxZy_Mzmp7{iyf4-tZ`ju zCrVE&<$*PyXt+P$C~i;{%(*l5Cg7*T;8^qKhU%VJ5H3%^(4gE8byaF=QESalo-+5L zNver~>tLFROrjxE;^v$P@7E8>7J_~Y^+SL;b!a^~2VD9Zd@7@hmEOMfyWd-u802+T z#$Q{kRZWJhEYY;SY^f6u$U&KUlk|4NZH<>SOE7@uxZ~l7jz(LB)9_^>b!No`@mKV%dntDfu+NU(d}+#cf=IcAH0)i+&B<_04lB%ytq+TUX`q`P2W-RiY)fmOg8}S`0(vhI zSx58OvrWQ>OYIkCy_f?k!R-+ZMLA@(77K_yQ`tzv6uuDNwHTqd{u1=X1hw7on#L_4 z6CW#41VDVi4_aa+iNsH_0 zxiv*Y$i#<-M4xdhb!O7X z$JyT0=umD9y?G+$mh!6AW1ayPQgm0KYzRHrdj}ZVQAOTo$z})|ziC(CO5uP7qj9`c zof5PL^Tfs^F|Qpy(nv2)K(`N=_D+mmO}Tbgo7=Gm$HH<8qAfj6fE_a_R*$)c%*(I$ zamg?1L^MpE1NUoWo%?HhJxBBB0KHpbd|fOqHoNm6)L=xW^9Ix2E4QykLm6s)h^85+ zTL7s{7_tzRz|!XoWpqdge}M|=rVn~Jyk9$A#PH%?T7yiw#Og!Sdtl(#XS<9yD$7KH7R|9yh z-7uv{Qk>XIpS(sy*Y^yPhS!APP6I137IKf_m+26?XZ2lOG8V2c=}@rJzeR6uRdAcj zp;P>4r(^rhIiC1$qx(6lWq~~D9HshryEc^3Cwh8Fj%)U*ettUt^&q=SqVx-_yNzH* zHks!M6i*88ViHAMKbfudY(EUgund{7a=f15j;uhd?I{Yz-Ww*N&nbJ(h&ER6w*~)v zfwGLhSVwLr8iVIJC7R24m}}rGvHZGTB+_GwbNk(EI$d60c^D4?(ku9QfkCw6zP?xV z#lUa&-a+yRD3eXrYZ-)y)^h*#{dN-Q=IREC4;vl~HhyZex4MTPUbA}lfHFw3fowB8 zu$fq28mVpkopk`^R(ZP+hK`g-eOIvEB&AQg((doWxjQW~UV%2?YrnFo76qRLg3eTy zXsHP(k#aMB_^N|OK0Hb?Twdlu79po~L(I~SF`iqZ-Gn_0Xc0Y}#@cNPsv9h{hK1fK zX+&>d(5~ctWE=F}*$;!2=`f&$O<~lJ2VHClod71euZq$1$wz@^r?qTh>KDcfp!cG_ zIHsDpSbat}?dm&I`Es3iDwSL}jzUq>{oXQs7Aml60&prXtPUScChBC(8L;eU0+uTI z#%WOUxVIF7m&H4=KSO;%@82)fm~p!0^G_?R^1eNL!M)$10Ec^tHa@V14JYoZ>rAqD zesxtW_VC|2WpJRl58l|HZW<=puF1q^!K;>5AWh|rg4b<-L75TfJJLIL&XDnfR1>i zPcz8yY1&~T1^UyXz1qkKkQ{WN)Kc?j#;0`?)4geKc*5V8Bl^nm7~CJmR7Q`zPCSN_ znu5LM+5>J#y`&Dim*;6>&_%-C*|${=zi(S{&i>oMSp4>fiL%xN9#fuW!i<5};XLj< 
zl~^ibdJcNyDM+b&l{#a}p15iSE1oXq(g3s}DL{j!RPT>zk{9%o+J!S~t_ZDEKaMw- zs=5EW6*p(;w>|z@Eg!Idf5w&PUwddav!Ykd&>PyYwO~Q?mcO4TNueu}g+oBIQ z5tJga)6b@fq@|bdsMMeSL?PqelRuDT;&Z<8GznVBbk4G4rxU;dYcen`wYXv+Ghju@ z8v%@3xl~nu&EqfI{2i0$<9EupM+V(AIaxRYN%u;Eq3iuGvbtPT zZQ6O@B(Npobgx6P&+7wSYfz=$PoU2m+tJSxhJl#Iq&{h?AX&5TEA9tW8R{9O=%!(H zwtcZ{3=#TRk%rb+Uv0lALf!fkg1%)Z9vlOp4n1j`Jq4I;0^(*M`C#LFW^;G4=6hNj zbwYyjJgF=k($RFHM_fu=<@CFBN1cDhj+F_cy8&Q+XVo4RVfsCled_2Y+&L*~>yXf3 zWdK{;iA^fgjbnRzKQ0rXIuhi4OQW`!y57Hqd@GU1b&@)QSCW*As#<2b$dnpu6xEca zVDQ$MU>+Ot*_=?qTk0IRff>45Z9d)LNyPi$JEKq3*1TZ{s7ni=Xo-#poM3?OZO0ae z#vrvA&#p0|!e_9)VH(6q-GMVGOr!YwcwhS!FD^-dP{49!Vw3CT9km-;C02-ZZCBez z_qGXo(>&*0(b!0aaOX)cEih7|XAUA){7ftf&gMydcBasGjARw`We6%(KLYK<3YPr| z=w89m{5CXbK|HR+V;TEG8E!>&-7JVUmP8?)0**VRoqp!NQCsA#zE-bzP5UM;(aoM) zT(vYVF^jlr6>|7l(&tCHXEweyH+M_=Hkg0bjSCinhKwd7LD--r!9j;qe1tpww=XF` zaN`;(CpgucfMvXX(PVV+j_&+mcK(rTEVid~r}UZk)exGD)?Pl7{Zq|+uVV{wr;|~! zfe$uqyvcEFsrhTj?$uXjk{$%eK=N`@U;eG{m|kv*j@+EX^j&n)q*eIR7Siv;a4W?N zubN*ua7!zLKdHoWZ9Zn7@{qnTDV2+YEeqU46Y_uH>+~?a29K0X@oq16|FO4<{m%@p1;B<=DaYv@FaUy?A`b(_hG#Je^^;pc=is2+4+- zGf*-l;?cz0w@ut(!{3VX7J*qd3W~m%97U+d;ZSEztkj@{kn+fhN4@h?j(MOD8{>sV z{&nNNI7_1rhJHWn>^lj(x2Amoqn#G&xU~!_`@yLzuGl{p{A_d1c9i!jcfj?FP%&n* zYO^=X+B4}NHjBR>sojb-K6A@F!W`+jezEC><)k}%sqrbHE&e^K{Wxhg-wLWOLJ`dsZEjEbCR~P|pkcXU zbiP)%o9lue9`Lt-TeiZ&&A%5e4B8b&@9ZM}Sd_3aRrmA=J6{I%MjkwX1K+c_i(VT% zV5`ye0~oD)YU=&?^mp+@%oV+5y0giqEmYgl0D_3s^#@~3A4M!J+@)SyRkkV0<(*mo zD#T!-ubQpMLS{;;V9-*)>ODA`&*k??q}5dV)j_d}Z(2%jEMw}J2M`&_PH4%cs=#Bp zeGZ{-2V4wsr+C&wfOGwY760V*>Z5F=GmCJp_PclTO)um-6Ryur%8Rqw7tnvNSFbz- zqfxrv9#db93>0ct#rcNFpn~@|gp5Q2sl_$-BWN@AwQJ~wu(jM46|16Mtp)-;a+j!A z^f_X!Mvu%20@^16-aFFix_1UEcHce;JBGGyL`$rHtL%$Ey!~S>S$C-3@bRa$oU)No zP_L2h~)fT+Sd&z95f*JK( zV~~U6U5&HXZ+>F5QDRsEoG74vpyHwKC?`iNyV%nJ$}QwyQD(VPev}_dkv_S}<13lH z{Y=FZwji||O~fxW%k9sZr^hUu{ju*%2_N>^*;78RHY`8vIO~)9>h?Wu_Tx`DG$QUA zukoQlF%0$S@|YzXwH)81tgcWgmR?M|R#SDume$FEh!9J`{awvH`>dlIFcJhCWuGT< z@_L$U9QW95WR-kc~tArq#kS%lZm9avOb49ThLWsM-0ARt(kfW6IS*s0@F z2>iB`M|bEa;_BL65G0{w&3T+4(uyk^`__t+nJXA-Y%+aWSl(pthA&VKrW)Zb`Bu1lq-%9 z&LP9K-iXNEnc9)Zt}_Agz7?wc#gNhMRHct-fBLku-oi;X?~lD+<;?hM+W6o$$({d- z_8(~RfE{6T%ddg->EB)Kbh-wCJTxs41i!+O)EFo$zpijYW?R4FnunAflxJYxpj$0> z`wVR*ko*!i5D{ICyAle-@#TK!HJr#e_`_E|IWx8&>()0^eMgs*&;CHzWB;aU*B>3% z`Qv240>Oex1)A8GfV(RaF0nzYyJlh5H}pKG%EY7=1W1woxpWW<9pao4XTUD%B_Zu; zIRSs?sI4(Zr|FERq2*&5Z^Zg}4_g7agLoVz% zfs8z#aOp|1+_{V*rTn&1Yo%-lsg_)E&7Np{#6NqFSLmS!Ryr$;=@`xtQOI`K4^9 zI*l|ms!y@VTYCd7!yy+SaeM&ng&aD>+7$2$L*_x)c~5yHW12aDta@|DUs1eNj6$f{ z^1wmiNG#_0txBV`qFdGs!AHC>{DM$N^S3+~9yVsGdw+)wfI-p)+|LuG-XLJ}--=#Z*S5OWwcI*F`|kQxUFhSfiQvYUyGSe# zT$tZ8^Al)pM>j*~zTih)X#3!_)asVAkt)ZuONO$0KQXbN%EsXTJr1oJki$2_%Fh{^Lf z#*qESu$(+CR!K(gm$s9h&<{n8O5T2}-~B?C#zGK$L-Ha9iQz}xO?{tY2IXWeI7dLQ zxR^)E4DJ=o_Z)HG?IwBsxCxHC^Zej~J6z(aLVj;P<@|i?&UDy~kz491RqDt=FvfV# z{T9v_jZWXTEg)6A^qa@uLj&Zae&@Xj^_`ne`^pv(&aiY+XEmpySlor$_4u8p1l^Wz z`0$=tghUEqFTIvNc{_TQFgtNlJOPF{2YPe@lypNc(G^ z!Tw}^MY9(uHf6ujog(sx>~{@ys?#nW16r5w+@#_<>iw!~oRdqE=5Zp@aQL?T(LjH> zoBk%&uwJQBJ7yhH3BdT){USC7?k>iMpfC0wG2oW??`$F+I~X3UOR;(Hed$LjB=Bnw zu857}H&DYF<%?=ZDAm7Ekd5w&&bREleys<+w-}FdZ+bIAlg#zjwDkM}uT5}rz4`rn zIXzI(D3<glh%=2~#LW45z#YtzK;VG972*8AZU6b9!))Mg{nu;Z-jkt|Txno3 zDJBPgE9-#o`mGh`kLreq6vY7IT#3z;C@+FV*wWpc5~B#-Xa#;Gk;iaQbUi6E0`<1^ z_*s;h&g+ZQpIe`M)x5wqsWxW)ceS1Y8OuS@AEq&f=>lqKhxEaGr7`qsg@UWUs2!g1 zS;1kZei@|te<4qC_ya8ntE1_E$=(b>*-F0)#eljkK=GpcZMEpY){&l_{eyZHQ$`!i zY8D^GQ^d5jlV|l}O{hoQ4dkkq5SM`LtVpQ|HzYlQp4QmIs5U z8E`G-Q{2^72XGa2JJE>w@YnGsf;{Xi*(5&MDy*vg`F2u?%fY|zGdXNCB>LJj^E{A& 
zCuoa+@`<`l@5^ZzCf3#Cbh8p{Vmo*|>}DlK{LEOX?At$V!S?%S5O6|c^{xdDkmZ({ zo^5v8q|d`tA{K&olJ1N#>q+Ea#Ja`C$hgomugo>^E_?AdTE%IgR5o!v^bWeyUfS4P z8=|YlG&=foNlS?hm`cB8?N!S>SNXEHf#Tfq-n&Vo*Pc8{Uyb7?n92y}c__Ofo0ka3 z?nK04ZeSP-q|Iw&uT~fOp*!GDM)m0Rfp&}K&nzZohegs8g8oqqvd7uaK2v@0cVRVS zxwdl=Ryt55m(}T_y2yp|nkE|J=_VlDRZuydMMU1A^s^m;O)KWHPrBZK`w$^mtZ=DCPrikKPF8{U}h#lZ^e9~igfKNj= zLeTToYs#KO;}%@tfDU~;3$E*NhncaE|C+UGBN@#Yg6^2tS-^u`IrD>obTuHt8Fk3{ zYZjAHhh!^rNx2>2;5&%gS zMP?T=MKZ`2a|@x#czvAj!5TE`BFN4%;PrsOZV|JEH}Kstx9z)jOWm4xgcQ_oTv(W9 zYD8VfTjrOH8LZ~R>kXDk^S^OB*Bh6>epsjjT&7$0rYNGkRqs-s zO@cOS5SFbyPC>7NfT;iy0E<%@Z1)v@UahXG~M?bwt-aV z0c5>9(7z?#Zdg*SK4vBt?9AqLihzJ>7$W+*4`5`Q#}!?0)4o zSITP%wLjzJTDVD78`(-a6{Y#T#Ow&R-hUwe?S!(El_HT};!j1)`s38g4>^-C=C+0c zLSoT#Tg@u@TfmCH4Y%r~ib}t+?{0=<>`V~-gtl3Fw?n7T#O_@(uvIk3f18rvrw(-L z>!%(C8b!62-UfA|9ABSbq_%rbTfPVZz3kJaX>J>=V`-ef+{V3ETffO`1&bZ6 z8B3+im*%5*n4-db{=n)7XZZQOK)qusc512?yHJc6f}d(^ zY1;j5^^)+ckyHZhi0q|&nXA;{A7h$B6@GnF43WPO_dPp8m)3P&zPwL-P^7Q|Kza0H z$-cBe$~#DZ>>UlFvh^a`s?FS-Ej-;yLvpl~tFVVD&lW0eDr%RTx8C902V~nbD`K`V zhxuRXWgm}h=IdU1V7Rh{4vsK6;NP1HxjzeBm!*~r)aMMqcbuX;nv|mpnjU|Q**jhd z(lKbK$b^aMN6I`VqmKMs+Hx+Ba`=gBMWryJ%k29~{bN4hTp0+J@Bp{*$BeLfP(E<+ zN$%}^$q+wE_=>3IB9RQ<7d58#q%8YVT9ofB6q!u>I=C=CJLTBJ(Y8qwN$wG5kTQ`C zR6(Uo{H3afr)|y-c{_?u$D0vQ`B21H4eu@~mpPTY;*6DD=FsQ({*hdk&~&VO{{v9Y z6A7TH0N#oXzH!ueeUOpcb6YPok*ZMUK2ft4o3>_fQ2v(f=*;lA_>SD^JFfe~NT>o@ zah_B7;Ii9S!NCm9hW^9lps&|aDP*!N>O6Km@Je~Fr_*mEL+lFK-~Fo-`q{@yAq^-i zhpO+@HR{Sy%s}(jB46fsOpq-Ch3n}#Wyv2VyvyvDm5A&z$HGNBUFh%7^*S-d2M3`~ z>sbDv_G!eony5YTs0@FEUz?I^5;6gxt%3l%j-Y&1S3FQT`SoJor^sn%A2+mKyUSu2 zTeTR+j_0*iwl(c0Yn3SNW`msvZcK|6oD@&JYc8x?M!e&&nF+FW7BuMmt4q#)7Q?LP zbiDjIO03g(XmaB&Bj0iMbLaM#mjr{hbk!U?{d&`ZI|l)Ai;d%7&Muq97^8|3yoav; z9?glfh2UnMGVaB}5!Ps2^-0bEKVUbv)twQ|s5Pdw6BI0tZ}yhY??k-RX|Vn08C&6k zlm51pAw5TFPu)p>B30lg7{yD6?AFgG9&5g*j&%+3EqBGJjZV=hCrYor$my-@rP}zV z3b|O13&+nDAqw-aUiW6;YRq$?KeCHHrem?EE-l+Jo?x@wesM*>m zH>A?GHHYc`V!1#3JM>%z0QrhrJD@dY63y_*Mk1`bq-3l-)aWnt& zT#cd(ZjBKTclgOh7Fny6^!c!W{oTcB>RWodDqz_I?k=eVjF0_$`66p%hW*ZlrzP;1 z)%?x0SjGtYv;G2riM6~?fNXCL_e}pd|#n_WC*hgK9zG*Zy#Qr!pQP`Rv z21*0n-$aJS1EoZGVq6-MfKs_A>DPT3nkaE&JyW=tA>+2a5v$}WA_yL4iSZmp+};yY zcx`;IRyGj@QA7;a8{>7e&pPV{uFnA7pJlwcV5)Cx70TkV4z|6NE23rf#|*bVydb=c zHFrWa3x&v1hjB_2HZHG2U*PBoj=~K?GeEgjWn8MMzSzspor=t+uHQpK%uG)wBjhHG zPh$@JluRRf!AmkPqVI5mZq%AUui|9NZ+d_gqd#o?Tc04eZ+>cGKC*s-d@Q>%b2DJ6 zL*JXLTtl{-F^{LI;Osw#V^>e@raBadPjX+6$A&%))d0cPCA4Lko>!8j=Df^&I3#RK zwN#cs!@*GUY;(4GR*SuO2E%;ypluD;+&2BID6m7aWNdW6rDHi*SEpl(6IhjJ|0z*U zbk%0};ZJK}Rs}`7$GfP{IgjJZ&m*DIese1#Qz?<0e+MFh-L1sufCsspXW%maky83m z=)_Z?sOPo^5p81`4`^b-6nQC`L${(Z5xK6?$nOjwNeydbPs^x9CQ_^Q^JeJ_a~x{J z%Oq&Q?8|MvGN%7J&?57%sGJ*ftN!>b48yE-;-Xr&h?aIR2jt&o8o-KI9osq_uMPPz z7k*K;va#k5?36DQdGebN9bQwW+Sp7be`+e?)cAFvRy(XUKQuppr1AuMr@abb|umX;51<*$Q%1mqRS|%M~&K#9;8H^``)HmV0 z^bFC6nIUk_?Arko4_xBAz4G{~omQ$J+}$=Uf@SB3r9aIY>`gGRYxIR~CxC_pu1Lo1 zqpG!lGeb`%pkjy#9MDI(us#TMEQyA&8i}Z$?s%RV{y`A&-Z=b$Nx-rFZVT>``gFZL zHa%f^ULD=_WQQlUq0W<}7LQC1ul#jQl)&Gi&@I{7#T^=qG>d;WePBLRo1|kd81p(N^a&XEKc%eGt(?yK^Ib#a|P=^cB^`EiOo$ zUaYzNBb8zn_=&mgD`R5l98+$S45-IS;d4UYWxZ<%RWS9t zm>@BS`LIx7J}_bvk-hA!PqfvamnV0OW4qw?z{WF`00=W?g~>U6gE}OBUBoxqujH!N zesro%(F{&tqbWL^bu*tL`m=Ui$^ii^V^V$gJ3W{)*gEeAnx&;*-ypSa7tKFZ92Sg< z{>2j7{qQ9iP8I7^0MXF10YrV~dH$*aCplmJ!5TpJYPJ&C)mLZq&ceH}dF@ z;Qf)ZM{|+5m^dOqs}cBuOQrSgnzdwlsTwJ#^=oTsAon@nz}-;@C3wX2HLO2;I(UN@ z3{&2(EIKV#LL`l>K&V7hHnst=5rnqNKm7ela?cA(Yn$+gGE&<5 zQ1+5vH2?|dh%8Xdnyofm>Ha;fy;==&x&Hr`&;J!X&=ybsE-nzm0TV z>~6=O9xO8^k}?ij(;w68k6j9@)s90bku#Agg2*T)avmGnoT@W*0k>^!IL;=9|8<@K 
zjmde(pDM_UUd(t3$0G83bCHFU9-&!tvI9tpX4+~hf6MSD$i2^wMU{L{HM7V9M%Lhn z&>qN$v4zxVF3@+o0JOU+U?~LC8JJ4kgGff_#VeM!msLg7j+ieQk3R#(+tUKYN8<5n zW3yxSCN_r>g<7$HI1t1oQD3ot9q8GHkOYxUH!dm^mnHlWXtCbE%^#6|nP37Vi!c5%w%87Sg`IMQ{5}6%r#IK3PyYZ5 zJ)m=uH_2H_1Ji${H~t)dDahf&r#mUcOAkqA4rXSnyk+{Jv=L$$aftQ3F+7xrCmle+eYnWs&Y^ZW{I#$8<|m6bTzj;t5g8o2B272s4tcGiSq8S` z1VesmQl=jQV}UpzF%CuuK(m&6b2BfE}SI}%+WhP9vU?s5* z?bsR1<2?5{2ury0V2nAfg##+GyW`IU5q=TJL6*Bwybl4mohcq-;^x^CYl;_AH!Ca5s{%*em zM^u=b9Q0gYa}g^stMu5-H#5NxBB!5w6=2k8m@FlVxkL*@)AU17*D9jaH$9(ZW+T z&-r#`z5D&KLXg-KX#E#EJLA~*_zs1(KQr%aA+>ZWTj^Z`irRa5@c$XZBf?dyikY^4}`v+`$ z3QiJc4lF^99vRVSx4Gy${j7duC+9>Z#QM=G8(57@m2*I5s6CN7a#h$WJu-&Flq*bd)X!}>n z>7Kk$sIb)lHDU-{%6I^rUHJ6=BXkoi-dyaMN{_0j8qPL~B?j%k-jih5W-LO@Po21z zX=6n6kgVOO-rKiTc2#rwQdFik0cP;pQ-LBp3TxkZ1W4*#!Ft{wTfg4jk~&qf$8R|FRMgzd}j$?)kDG z{VoXa)&%<;xMAYN&?_kvE~Yb@=>e`wZ;Rzu1QQS$IfSa{YyF>zN9wd(o&mi>WNdbc zv&b1+ zDD|=*PK%dd5YMKO-<2Y}dG*18H(aG=3#09r+d)Ivg_T<`U>i-}gBsdT;B?>TE%}Mt zKXCdDz1x0a;VqkWGZ65=WDH}q97{1uWYK%5`>55#&Z7qo6E_ddj)}Zh-=Sz3Hwbx; zKM{BGTWUHKDB0wXAQE&Sn8@o4m3;5(noQP42ucNDF~QNf&&-h6a`D?sJsjru7NcQ- zzv`p23o;x>Cw7nG$%gv!giUBqiKYHsqv5n)hjajD5BlUbARyi!TR=y;?ia(vqVwlR z0Nsp*@~BZ&o#i|!v7zg`PW%@HJw&}uEBP*4%bLFG!tCW--*@#`E&eb6maWFh&XPbpt!F;Q7DY_v4Jh+HSQAt>ny6&b;rZ8)Sl1n+ z6Q2OGH02FQR|O1ikSBGJ!Mp@oKJONN9zoC@uGBVsvOnqa*ACbx$8%05lSYCmWMyCH zk4%(R3f$TH>b^2it<>tG3V|w&)=}KT^sl`wTl%b}COlX_IYdo&7WMSbaK^XQKT}QfAJ7mp+5Xk})NC(RP%!2e8WhJM(qjrPaCc=_IwCh z@rIINjvU7Or3HTD-|y~v9-f!ly;yyi534(HDBmR$Vn`U7pHki`G8u52G&D6r9kVU% zAXTS6^`2I(4rpEv%RuPhhLIXA6pHBRq;Q+M@y{Klt0DO6c4xL?AvmPi0&R^?ocFz2I zzD|7~CTG!KBIJ}(PXHD;A&sJjz5V5)Cnk)QhogxBv z5WMGfoXs0dwXL$(alA}ICXh1Zl}g}MAT=%1mu}>@$xhVEcg+(-FITbY!X3Cdvu}5J z%zQ1eefCo7TV=onqr5Hw2G8u-Et3Z|`sOr<$^cKx$-GrSE2fkG+rDU^h}_g=8THn} zzJtI8lKeBU{-pu>T}_Vvb^~^P{F!J4>VgOJQs}3+NOGc?xdVMf!>+XqW%(px$;KkN z>fV=$%X>!I*9>%5W9@@>vK_Geu!_}g|A&(Uq|hVpT+Zr{J?=dbsBJ#(|_vAg$9 zsZy5|0OHUDft|-K6R^QJ1ef)8TD&In zawwKLtiDR;5EqH88uz2VbpN$4@K>sAwwg^vf;lA$bJx^L+clzIUuNDSe%H=+1Z-agFY(^v$th{ftUD#S(6af;WNX~ z{4t|Cm${l%uOQ&|;r!;RH#q2h0>2}T#@RJzDDvBrjL)*dd$)N*))4!u@s?PC;0A>@ zhn~XqL`2t9>C>*C4qMb_maCd&>QL}4KMP^wHTERk3>t%=icS=bVm3~ZXS`He2N%jl zetA+~2CDaX@q7&T1G-&H1}lhnfBKEcnIA5Mz}2#6GS~J~RQ;}B{+d^?v=$ka|Jt<; zjEd(Pr8e1$lh|Owk013?KU-H-TP1hiJr|XBO?s!=%G2RxQpU*I4I1so9rt=&ShqsE zL*J80w)Eeau|zbP+DR{l#i}d}4%UGnKfJz}?&Il>+}hly&tH z-6^#FYj~?F!%$C@DZT8R$Vy4bhqe9`bcrw{qA2t$Zy5e*U zItEv%vKJ??X-N|79&=P>OtGR8oF##5O_K?~AJO2$)`|p#tQ^cxNiR2HUd@u5 zjV4lke}1&mzG`XwCHkX6pMTL~g9LVYC3X zy53D6=%6dN%rTTcIcqCDtcFf|Km1*`ft0)dC4iko`6#=BLAK)a#pH9?=I}Tz^v0g2 z5HNYni8i0w>esIfRCvkj!nu#d23i>yAFJ@5k? 
z&gBJp>+d65=d&C2O@LM-pMK;;Zl!0PRS%_G)*2Flt2u{{IDn{V)#fvsa$F$(8hB5=ZFwSV zy#GlPU}_zTD-huo)gB}#+ps3)zLQgtPqR4P}& zgd&5QMdC8iXS-h zJ++qtVH|S0f^sBX3!hWanoJ$^dKU=cuEgjZG^dNw8bWSQPuj>ge9C8dT|INgR0UGO=>ewL2= zf_pYfmUHwpq8PMk^Kl#i3D43Mr7tpprO?YjI{+gYO}-^okm!pst%va|a@*>%S=|u= z90I)Q?}dZ?#aA+b6ebdr6}Y)4lv3mdo39qXP*%ellePQcdJoLU5s}*!%iqsSqqBdlY@)y*3auzCnV;myG^jomfJ6dFrWPv!OP$|&XOTZ0tc!rt^QgpFC!P(;f z!s9b>Ou!BC5d#WqbiXCUOGZ>lCZ3S5p3F?|=xutPSme~bI;{37gx#ws?S#>ctKlLB ziMF8RBA<}T-B_4efkymGls&4%9R;23|5f*KP8v03^uFY3F(&o{pI*?jX|z6 zdX{r^$G08ds;c0&{Bd}dS6kw`eKE`kWTeCDNyWy*cWG5Z7{Nb}RNYsc1J=Din9Gpg z=ANDl?MK=U)jDJMu8k<_>oGlQ2yEv%ofLcyKfj*9CWO>KAAYb!RAB^ za4MV0H;app+nOVZk3V$J`9|Ws=cz{8_XWT13=>_#R8}&}lM%p}J9^Y#{39Gdh(b7& z0c=Q4UP$Bbc1!rx(ZC%9QN!;#jfl+4h7^mu>#8`1cn76M2rSC}_CScGfU2^19R+T;s zCS6YS%)3tS7X*K~Cxm=0k%ZNDgORp@hY+1vQmiDPuET3Q1S94hJ|U_#PB8lggI29y zOa}(wH10ZtcO`wLt=6mLp-ynrKhX5Nxv_8h6H-aKWgs;?w0$Lev{>}o2X6&=+BD3Q zS>y=yHgvXJ+~Cw^uBg(4GwJD7!t-)$uqdr)~j5b72};CnK@z zWhbYp{HRx&14aZKuo}f8c~SW-lHucB6k9QOS#b*oabMPeRN>TLYQfP@+K+p>Ohokx zgOa2T)2NZyv=HgG%HRCI`_(=bX)6IHAw?cLhWDwS^y+ckQt)kn^E15X#Jc1!h`CCE z>L49^n#kGTj%t&x>fEUT;zH#9GJe*Fc(4K3ZDcg%M@ubLvXeLa}MUjk82l9M=bR7G;RL4MB{!0Gkq* zgbLN}-DmTD=bJ+vhPKXwEE_#f1Y|z2ROrew*{oWT2&TbW)3n{m$0#`O@p??lRgTE4 z1J+$;BXas*woa-}vQ|YLizD=Qt;0JwvESCEKZR2!m4VZMC5PP$icqYWJ^tM??Fl1| z*jiZB$(R{HUE;E^?6A85Kp;PkeP@8aB=P-{$zQ2kMX?@V6jE&jUWXjbQ&g%}SHs@R z&`momf?wX{NGG$rARSw{8g*^?RDRs!T#`Qk-Q__8(PFRsP_=BFQ_r{MuX9r3{{sKV zfycS<#yCT_LqzFqvIU)|7HU)f{hf!jXwR0eYoO$9`cxXf z9$R5wo<~9WPn-EdzR;YAqIjcrVS^T@>wsCba&PB$^%VUYXTw zJk&5-g{cY)nP{Cllx_{Sn{o60Zus;n(}rNrR8mY3461&dZLY_ zRZm6Gb|O}+W#zNV*tW72c`|o^(@xja9qNea0&l0K58;n2ZY7&$v}F!f`qtTuG_z0F zpSv3Ft)5PiQs4rHKsyZbdRx4`gzK##5i^DUk`qTi-sp5wYB_b>o83*@Mktjd2R^~Y za^4K(9n4JIjEUt5xP(Mr&@|QZz4WcNQ;y^jxx6Z5*^$n_t@8q5MhMZX;EsK3cj*z6 zLw$!+G*WPrL#7A37dD|m;2nnf8lBJeL+TVC5tpG^H3ae)tFhZmDjKJ0rs+1p+)1RG zcY1pIgYgOr1l?Z0D>{n*!x%PODP2qV)e-@wH>{dOiPXC4DpOa~BuS{MYb^Jrd;Ea} zBmU~jqOAO}RLv!R8+L`zKl}pJX^2e-1^WYWqEG#mVq4+0A*@Qt{LHc1qJ1{8Z$;sd zQPl&|q6j@4#cIl_lzIkkkFP}RaQeEgt|e<9=G^gU44tRs=OaAs@~{-*6t+rw15bJq zwkt%mG5 zIltdR9vTkFb2Xh#;cyd&f5OMUijz)TWJH|2z2t3#;7)nm!K)@O#2{ax4$yU)kJ6D` zMtexJ%*?Hc%Q{AGdA`xa$9Y5@^3=+F{Af4yz^#(-)KtgfC}`Yu5? 
zQ&&fFgX^*O$uPHb0ArdsL&$mw;fzU5`qKyBM3}7bbZ75rqgk#NL;DjG z-{K@c1cAeOl^AA#u(ns>yaS-0NrE&vm^DNEP^9n9JqIW2(MLLJVFZ4_df^PGDUN~i zVN^wlbuTExUR;YXtsbw{9xhueXj zCKCvPT)MDu>@r_c!}f$61sX-4qm!`I8FfLBLYL@*nG&Kz7GuCVaZ%c`vYlfyWF?Nj zYpX@3f2NL>rl^;6#qyT}{hs zq?T-LI6$AV!d`U4D(s`i7^54dc+GQ!i}KnpLibLBuB=Ksj;TqvF~Wv`jeq9G(57g3jdJMXNDnJExOU?CTz@XXDvqX1Ym zk)RqQQQ0ksaj9kx^yRIlO8HDiFAGADyU?d+NbHr{?zNJts_*S7(xpQUJjNyBxpSe* zPL|5Dn@)zJ%9`H}z2jN~jWD?C16RUuY8H@Ko?X&4>+ub?>bZo`DE;^ggnJImv>Qs~ zc37(}KIEyqHXYRbNZ-RMwp~cS*hE}1nnbGDhB2bYLblT{%Cc`RS?&}nWFiT6;WtyR z)I2Fs%Iz+@hBh~OPJZ}uJ?1q~b+RhHc^}PdjcC!m9X2#uc4mFD?7w|P^kw5PZVrdg zG8AH0Yxk=|qsuR#%9VOa_i>3kOdine0w+-A?MYFbYXQ3tmnUV`2Qp=bUQq`|Ji#yh z(V&->tvOj@*jV<+KHX2H=VL87(P6I7P6O|4b!pJKAXV}Dt#u1sxak9Zt;MIpV1L6Y zB(2GESQsaM@?^()^jXZ6qrf-2EsjZS&OI0VlPmq!K`30Keo30$kcHqWEd?s`o}RIh zGkDe{Yt@0LmBzi~LJ*=JXUcDO!g>GG2Y`CEdEHUMKfujw#D*UL94}B9TWQ=ZO^mL* z@YM9jDB4Ha6@HG`_;rMfzFp$D$Sa2NpFnx z*cv^;mUe4H&iG)+ea<)GAHMw9Dy5R%N2Vb%v-(w~@6GTgArnWWT=rMP?)0@(!RBSu z)d#)=g3^oQSR!k<9tY>U&FU&=2DnV^w=92N*ZOW>Zf*@@z+Uo_uDcjLNd;H$$;e{F z_>|>6&PtoC)~}x%f@DfI;Yh$N$l$};8raMSTbqWADKQ!&ZLu$8i z-RAH`9o>|SFgcV+4WM@@rXq@;del0ja}9Xo@Ad}a9?=A0RXT-28lhsYijrCx-+9Sf zVZ!933ket8<#nF38z7vnW~x>~Dtb#!YpH=(r4LB6(z-*ER=w5;G=BXIH~B@7?Uq`@I`7>Rhxx=thIGAX(o zw!ZNWHy9L~)*dDo+l^#&yPu-Q6guCiX60`cP$l<^pfX&2a|y~wvwyM<8q*TrYTZs; zeoJ_Zg`EKTWE_^6!?1mwH+zjk;w>wb6d`*e4lA+0QD8#=bxF~f0vW9%Ek`sq9&t{1(@^; zO~;&;k1L>2b(VPOs`ZM)I3xbDkp48;G@U8CO-!V{R|`Kn5xe#AF7Xy}6!+q{Ra%o^ z1;IDhVh==|oGza%*taF3ewosSl|6=7k9Fof5npY8)Q@}@2Ju_(X!&UNLhCCF9Qavq zre62!(Cjp{NJz9ZSPmuq4Oc?vbi!8XBY!C5dtD?2m(Y`vA8f3ujx(nMFWC>tzG&D? ztWVs$nNwn==Ve}TwIr=k5m;Iys<0B_ruB6sWS`1cy4~_cyaF+%gHNGB$B7IC&*=O< zt1GLfb0)aa@)+SCJ|zqf%u{KZRI;6q zsz^twZ@KG*iy%r4O8Za4xRvVXo%aFfB}0p(v+e1Ql5RB8UTtMEas;Zo&O;=u3vmt+ z^OAqaS&Xn&rd+xRgtSD?^8@ommG9Z?_$`TPBR6v5R_=S>OFIenNE7CpojPp_G$B55 zqO&gg_ugZfek#@6>X_ow#J&(?HS$wlBo!^i|B}VZ9_@__N8(TaT5tMOQQ>yR~8M8}r4!XDx};-we?{59Wse(8J3w4p|=?n%*ue!M+k*P?u* zD@84+r1Go&9;w>TN+`(lV9`Nw(uuTUV|8`gJlAUGMcSS>FLhwyGyUY9|KcW^-)Ed9 z?C@eSKS{~jPkzADc7b{YZI{qQW`T*s!DGG-q+MoLx zf~{@pNH=O>GcmWPc4A>O+n}K4#1LeTqORrFd_8Oifb_N8Vooc!?xV%r;?t|MZ7_W1 zE7@SoVHO}aIo#Es)EWupkFk*C-VhXyEm^Ep@}D2}KFIJKC8O11rEZau*T7Mn3eQlf;*!U^8cmxd}A?O*8jznvqIM4(d@`u3E8D zRV4meKuFV{Z9fMoR1+pG05ATb3Q`4|Nyxrk#gITR4G?kv=NhW3MXsJyhdIuWK%j~) zyhfi;9v+2fN0(sz5k$hvm{4Anfu}(KZ{?xq4+-u{%5mHBu!zAO$HD z{>uOQKD$qi$Y7~CiFi+7#S04IiIBm%7*a#yz=*CCUj09#y?-MnE3hjAC1(Sr+8Urk zlA6h?|1IzZK;R82+`s;Nl|Iw}Lh5b-=K23pUc#f8|6$ zW2z!jZUG%loCj;LPvh=Qgt)lPT7W=c_Scl@{7 z5CQ_X8AxdS@9+En=eA&jKJ8{ZT3u`oZf|ZXRR#opluhDOxB4FXc5WBMhyemAd__zv zJ6cpu_*(Q<@t0_5R2s7s#ho(q109B3L6a3!e3{RcieF%Pwkxny?f!DmMBln9z7K)1 zKa9E9dUi3Iw^6{e;DuQi10MdsJ4*RC9wo=5QI~?`!(Uthg?y1*1a#6TYg@{V(`NF< z*`|p(8z)9&NTAKMG)$P?&zkkZynA1aMRpsvnvW9rs72&YWqAHL7ftu?RtHliBVbAtNXR3ZD6* zWeVP$(=+m;DYa9oW(b+e`JyGgC5Y3rg39zemh{%ky$31GzC52#4~iS#WCnFz8w~w& z`*Zs?g~D7;fX|#^=luEuPOw1k4mR)G1cLoNX_7rZ>25MUG1;8l0|35RE*neuDJUqv zg->9uE1SrExj8z^OA%L!Uqy!5SzRn>tmOdzWIE{X0tm53@z^biDJikcm949}ID16g zfc~0{;k-jeE5PlJD;$b}s~G>H9h*G7?fj-#oH~@|SRGZAA08T|^GB@8b(IGg*DknuxFbDgA2s z$$&=+i@omW)67xaO6IEiK$$ns?2beRB7T`_E$y=dJoa=3ZEolI4Bsd<3$sYT(!D`2 zT(6U&2U8{T+re?KhDLK`YN7An;#DfCakiSa!+g0L2(+}`-Bs)hhRv+U<#^)zXS=8T z$Jf{MD+*{|0r();(o#8-bB{_;V{NXm2zGzm>vEgs{_zoOL}27kX3+m(@2#TZjGApx zoCJ4i9D)Rw#yz-0fDnQOXrS@nZh@e|9fEsscL>26*Wm665F8pmU;e$%*mvCL`*O!Q zuP{ch^)0DYvue&N!&M_Uu^`@l+Ka30`G0_3Y*03Pa>2;D;y(CnXNV1awDz(+CvQ+# z?R@8BKL+fO2~2Hl0^gyCrNK<35_K;uGS$>>c43hzL`{FqS6H}XeMVU>r-zF4fP663 zsPNNrI!WoZS~0k7NpAozWQ#@-w_1|u(e3rrr*RbSyocHt4YY?^Ns;?%NpBQ=nW6rn 
z2xqI&Qt+IXnE-Kkcw;8Z1^m}ge+2Jd=zz_XZcy$IT5spWm#C;H5%H$vbyj#E4!sjJ zR>(Y-Iz3N6OLRO+#f0yTXG@491%FHKo;+)P5k&J(^wxQHM&|?H2VOo+=On-HD>GUU z?%d|JHCNm$&kmM2au0jJZ)y)E;oyciN`ydK4nDpL_EA=9>G79x+Mw~t__}e ztd>2cSof$X;YJ^t9n{L-Ro1n%-_g2bK&6>tWow!}zGl(6=_Fxn@Q}o(AtD}h^loiz zu*mzCO2o44M*bRrfkf%;qcLgmGN@|nN^l}S1jWT^o?l?%9YcJ4M4h+qGN*-XYRy`c zACYWFKe$MtHp?VchZm)h;9~Y47&wH5yok=qYOO3G=_G!EfgyG2P77`#v|hSwsy>Q` z27fXL0jg$AATm%TYs{SteKgWH32;q1&zG`XX#^nQU(kn!A2K7|59caNtu7@{yV;5> ze(!`9-U^aVq8Bf*iwhJ4rU4rY!|Bat16L}(o+**iea|m+Ii1$n;e3g)D&&N$`7*EGrWU@229dMP!dGM7WmD*FqFyz0EHC&cWWCGJ zdi4g?TAX*4x2W3|UDV?<*_iHYJwxtj zi!o48Lnp!2#jf|)!aRlMus?K3C7#qm#zoGmjSUTw!J~~cbCWv8DM&%uC zaitWLsNL(|d~=f%k|%fyu?}mWMSfMPQ=i0Fk2%J!&g4<4)Yf|tg9B}ek`h$ zFBYn-8^EPr$#XKdmD#4XnJ%Op`!+>okv*(W$!yAx?&7+!+KyXhSveK1`7-~T@Uud@ zhnhCX794rV+Ph^nVXT>0tbHohU5J%q!OT^?I;_{H`+n?94CQ8bWPgXh@wG9H=K0QV zC~T7dvisRWYd2fIq@}_+MpaU_=CRF{n@X}Pdb(Im{%{VbphBOoY`m!DMY(W_sJPIU z+vEmg18VEt>dH6kxlvA?W(VQjM1GNew_j^(Uz!{x6~(>BNsEbKU(53kR6d7)zozh@ zDropYTW>pGMUR^qr9yD$)~|i}iZMt#+}Wd%hXmtPbH!9+~D(!;7HBJa&nwrVmPY42aw50eR&wTlv-r}P=Zwgp12@`LK&TrU^? zU|YM823IVpy#T!`%?QNDerjW5W3cOirixGbajlLNChxLGnli@+97L?hA|AK2bRBVt>EL7jgQP-sm7O^yNm#VwH^yJMqV8$Mr5{KOa&O+Iqixo6AGqYhvY{_sXDt z(`(nqJ8@P`y!sSmi1X}WQB+jYfEEk!z1sEV=_Vnoo^w*y= zsD{V`vnBjO+!axGgt}E;zlG!uX+P!JWw+|KS{1)L65xj1pZ3K1!@CHnL>f}2&1YAi z{Ip-krn%}g%9xed>=eY+lewt59)ZK&RZ9R-vuVc1QaQ}g59&5`IWU|4r%tv?gXYYP zX3hGPLF3+Vh5uj~qjfp7bd><0Ns>dZRI!5yI@;Y#xGAw3t;^4)2}42x&!SjPQ6Je0ZEbI}@MiDOT1YoVB zHSiAut!QynUVbyhf9Niu=D)E-3xFkLofaE_Uqcm01Hft3=>G%lmsZdN7$1Psb`b!g zt*lAYP?Ccvr|Ji({w9bb1m@d34$QE!bRhd5(tTA4Ko+I<(fa=x#t;}Lzs2rDkm?W; z#V7#$^eh3O|7YYMHmUtI)a-yl^{(J{F`Cw?&48+pma&u;yIqyO&=94X!X)15a*-X(r~uX^8p zgS$y*RrGNm4Qy7!d_@kNASMDdDNjntaIUzr&J&@v&q5F8OV$%NwSHGE)Rob2 z|hxLVT_vV8_lgVrzWuM~Lh{(uf|M_kzq@)|Uxv^AagAPjwUz%mXrumP*g(RA*t8xD&^>Ap6 z6@Jf@_sQHs(%d& z@qaMvdOVdDYp>wAkC=?ZBRt<-ZwNjMi#fz3n5fOGW%EUqAe`V0AcfvCy95uskF7bq z=Pucxz@r`S5il|`#9BQPG6~`0{MjSY4ZI6h(ggxmwsAns2&q6W08fT1yhgDwhQ9E) z4GAWjs+8)6q;e(ItO#WE=U1%E0vnybP-dLYieH_F}WX83HMN z`7=spER&wypY>KZ)_pa@mk+l_%JKC7`&|!PulZ&&T~f@mUVSWGO!bF2=<;tCO_%I! 
ztd`zxK<(8a&l#~gbYED+Su})H2{*o{WzP|q!ko|x(EXN(1ZuRt1MueIW^n>h(m(X( zv+{P>yshVtr#)&*H9acehL)dy;=#nid8SMm5Eg7-zfMH^wKPs4uR7PAgB|uY7i8`q z>#4p!B{Z_xIsAUPl40zDKW!AtproFjdCA;A1BRc$!LP8;vPJd}<*40y6Ii3kM0~ZwU1YVYL(^&BObcJXn>Xoc4$S&icw3^#m z>}D>K_>H&<%1PA{(lomG`RA`}_v3Ghy5%JOL-fYg{#G8j%gr=XJFjie#L)7QRFW`2 z?QO#K9e4vV9eHP&f@{Uv#lZvPP@Nif!$tT8#YZ^h$&A$YW^abpQp4+-?z4==1gh1J z+YnB}7PZ)%j$&HbFP~89O}Z+Q*z|yQLiIYP_sjCOnV(gKqS|9=W_GR)VKjl?ekty5 zN;_aVksoY4`UHjuN^DLzwLyl4P;=@*6+FR1f)db*4L?vU1YDs+sGP~m7!c0!9v;=} z473nvdpyAJ^nT9%e8zse*-`=YyIEt`XK`VjBa3Qshc4tlo1pSu&4V$bn$Xf8VAP^b z!8p|KIjRPxroJS+Qj%AX33ZMZ^e*Tyn;gRQ*l}vLnm}br+1QZs-HUQQ%kqSXi$_h8 zUitLQ3ujWRI8>bP1KU_g%0+D|N=p!<5--MKF<~5@P;katKE=xU_9Bey{<;zqoEzZu zjW6x{J6jE8G)}qn>P4k}-8neCw)@~m(E69%1*l!A0U3BaWh}Rr@DKG;j>O@^-WXw9 zz%xdt|D&qwR}^FsEBI()9prW3$**1}k@^7-2b27#$H+}z>DekH-49QPcq{F)@k0^r zf5qg>{g=5O^7#H>4JEcBZ{_~QYwD2i{5hCOG3ObB2%afYp&N$y(-4udi|nj*-P1mJ zY|>}XmU#F{DL+w>jd1(4?0;gPhIhhvNjj7?`35J4rsY_L9!W@S6BgTS0R073MLU_HKdAeB?R zP$l&z|Fx8rjcJ~f2=v>iJZPC3QdceHhsJ#AKMr82u5#9JEN=b&xNd$M#y z55SVNiT&}?_|2)0dBD-dX65OiejQPW@0FK<_A-gv+FVKHguD-y({e_TU;8l zaBl;FVJwr)zuBWqUGx%C#gp(phox@EqPXCZh6DblumQt8{YfDg#JjB#2`!t4zpH$0 z9yb`i{e*CHA1FwXoVRA+HqXg96FRD^1lV|H5tltyzc6=P(ziI!`$E;ArG7Ncv?OIb8UZA?go`Pe6Ycg`avIzE|pjorOw9YF+ed89NK> zBu_%Xj0hiZ*7-9YS1m_C1)~?&0ycJGyj&KZ>7+{A52$sT>k`=cv+&_$1y}qh`0faN zHZ6w;l~*5C93Wm@>F_0R0Op~pI$PojH=4pgLFW#WxGEHhxg9!7UZ9pO(J9ki2}16s zM5;;rk^!brcc?N1T#SRY?tp)~+tP5mAek#<)g=x6x;+8@gqF|GJ7@NHht*&5rPKIr zC6xcflMr{OULH)To!|Iq6pa8}8n59rPx)8+u70+*alci^T`ixjK}IfMi=Y{H!%A+8 z(@`mD-f?uoPehd_g2@;p6<9f1@o$UoXjtn*ov~y2j9OijnjCYus`&N(Cg?O9|W819MtdB?gxHA*FAuGCyBriA)VZ4O$j zp-W9qTXYrr-m)zejz-}E3^*=haK>C^)*Doecln=DS<`KOZ^`VRU9u;Iti}^`s{Li7 zJ3CFLkfqS)gk(Y~-)z59O2CM|_1Opjp1C|3^!b-H@jT`Z&d70=?|VKbGSsu`+Q_H# zjzqcgN{pT%^R7Zi)4qi7eF(m12UHIv`CrMkxAcSI?^_Hm7HZkRH(~Z$eh~!1rB<`}C^4ZzHttb9Q>}>W(WR7t8fWP}$k1Ykrq8p329<}DV zoN|D+NjcYwNGH~{^4=_#DkWm*G}*1ym_t&0&Jcmoj~3*>rdu`#1_*$CEz89R?^)qd z{Thrl$vfG1=zk<6=E@HCzM6%84PVp)Mb}%Zp+_?#qJ-IwPRZHM9TiXr(rC*@pGa-r zoNDRkImxl=jg$PBeS6;q5v$Y}N0bR*1K6S#h%|LyZp~xJAxVjmGW7idg~yXftgMt|zPQp=$KTn)q@P2iD@@V{@hi&c-<-u}=d6OnEq z9Nu6%SGgJTm28RuSd*z7SyDitk!v{1R}1RRvE+JVf~1pS{cs8rDQ$Y2 z0ZxZL^sNTw;q!N4MC013od#iZA0?)}u|- zo%~0y)8J=VtmB96Ol6?s+U1$}|CP%UnJA=}W6Qr`NiRD!L%6*AMfhiqzvMu7TX`VO zX?aSDTegaO1;R~6c8YVWFgMv`zdlnsW$@anDjx~rYf!QfoU zO)nvS+R7#F=ZvXt{hh38t#(EStCEGbvw3`umN3}xV6rI~{JeZS>-Elx^+Oq1>nj&J zbW+X^thnQ`nfXh%>uR|WlP8Q@>!TPaQ!V^Y@!_x?MfCtIJ-E;DdV|F^38>y4YnL-6 z4C#z!xy9uPCu_|q@&RG)4dNf_eBg1v=-$bm&MJQ5ORn1)%YaE2V?Le@(*3y}6U*SQ z$JDJd#7ah|HU{{K#b+1$jx3p0q&nWna)|Kxm^1h{9}$2ktnv8p8Rdszy93SGw^sb* zJsxP%r(VQ3>@Uj#;7b7q5mnvqM8yRCE|_WLWMpqVP9*Y(*={z3CBl_GEfEzdz${T1 zO-ITh_F*Z@(&fhrF`Dx<`%_s^c!j&1Rh^eiKLLoxx; zRHV}80fECeaJy9rXonY}JvC2)^3In5Q(6};+P#4Sg)+-pGaVrx_nAo-8sDz7ABg8m zHQCc8iqgR#{`|RL-xI~dEUkO%;UlyEwbdL;z*eL9l#uhaP3@{E9ArA~=gkx#48QV7 z@_B*%^K*69lc-DJKuj6q+I;C^g!k1Gz6GI&=0aXx6lWo=B*GABgT;lf$i=!t#veCL z@E`*k`HUt8#8i+6O)EYLRQ1qfY+i)PMqMC0Jos*-jI6-tQW zYyWd!jaX{qlAM6O(%e_GC?ZzFX@muGhWJ@m*$yj0nB2S(yjH^sKz9**T7AoHVSg!0 zg1e@BEZI7v+*?U^1qK21Y-uYVWvOrsnEMV)`1@P85YOdWbGJKd{{XT59wY=*I*~6U zoupj8x!aa^F84|zzEJP?1u?76FE|XE(DAti@%jyj@Ff>b*Rk)=rgqsp4n)B)jc*Mc{SW^7tv@gFKy6 z)L34EV6&5p0}cE1DqqzEpN5w2rPSWd`i*>RQ&gMZJ;Uy()}-q8NUHkBS8wI=oQj4k z%bo+jXh9zc+_p_7rR(IN-{|E;5+(f~;3V0P67(O7^j+qJNQd7F&bD^`Cb8|X#BkQN zEQ!hL#Dm1C+q4eaYXH8DV1{G>GCi;0OswQ}VE1UmI`13(Z`fLuV)~0^+NouBtvzqH z-&L~CB=iT5`u2~c(j26s8d%R1GeC1R!YsuwBDBLormIJ8CY9jG*Ti2i>#G&%efxiv zdpNDB!Huh?_jC4(f6pxF?S$YakpCc{p`D5Lzq4i-%jndC8Ww)4QUonWl`c$MpuLPX 
zTl8lNi-=IISScwVs!N*&tUl5%7M0cx^_yTx``Nydrxy1wI_2Q-R=bjydJU*XqCild zgH}DNN?F-!|J3%ztM8^KI-@ zf#V6kjlQ+m(-U?3W+t__P5+SuS2$Owj8WT`u&&Ca#VWi!0=I8BDQf|K<`vLYya1#L zoloJa<-v(BE(eo+Qrs$&*%Bi&Goz=M$4IMKk&&Ous1+Slzy9&DIu*{mvLL(1BxWi8 zT7PEq?QM`gPjmNvE&haeEQ|U}iS#IbrMw)pZ~o==fn(Sx!_lm=WGrvxAA2M$$t}Nl zw1X@`!G5dfyEE%w-_`MRg)>c{V?``F#*b+1OPl^c#y;Ds^NN=-(Pa2KJpB+5R~mDr zr4^e*lr^pX7*14h84KhW-8v`hgd+}n)lm-hNqodIju{3MEzrRxwj%g1rZHY zi2KxMfwCW5LA$Xc3BG|D4mG1j2G&(z$R~NY_+rigLB^`P z)8%ArMxEj(=2|= zr5nXJ1@ow{mdN#*z-)*)dnii&s!T7LuTZ7Ss{&q@z1sWgok+~@38k?mZa#)tw$|P_ zU|7yytTTFgcnG?`5GtH2d&8yzf{NaTe*zP0#9q2|Kb#I)s5NA!Yn{-2E!?%K|J>-k zd@U(0{qgo6F{Nt5&#gPB0^4yK3r_SLlo}Bf`Rki32n4oLgi76;V z-FB(T!x@Ns?w>Q*@4n2tpcu zvdD16Oo>j9&%@rtqgMOleQJ#sNDpSe+9D~_yP3J$W(ve$D>Q3AheAWMonf&ZyZnn% z3m$VXzwbFX5W61VxWvwSm7`H0rM^aHv)k*WmsM)M4Sq`c;`RBmlnraFoMIKtzM-CW2uDn3CX;aRIGdYs}?0_5|=vx#_c5B*y~7{vg7L| zS?cz$-}M#hKRHx6zmWn<9`Hl$c*Yc0I1FAsib{q5^{cyk{>fskYEgI|!iYw#0pGXx zy?r6xHgl5i7asN}vgY&sm8}S6?N=9yI4UiFv|SCa(2Q$jAKsm<-e@O!({ySzchyn< zb-5)m000RenCLyrTS>#7&Rmg{BCTdku~!*-^S7rz#{iZ+aBNwp=*Km!&=|vGurKOIN?=tx%S)>3jiTF@f)!`RvO@o9F5++6MiQnE~4hf$6?Q_lmQ&r zypH{s!8b}+hE7!P%KsQLjzOKLat~}gMpI=N0FaNJZ%)A&ya@Hr7yT0N`{s2Yz4EW2YYkdc+qq2N0aT1pqtb032)=_U~u4tcr}9Xqj*Y zjnO0XIslSs)|ejw0G6bQ^a>F2Ce=w=0Kq!C*ftOtqYs(S0SF0O%Mw83?QB4Ju4&j( zBV6hW#u@>A0{%&<1~_fN0$$nE?+XLXh9UVcxv&6)BGp(N4FDQQ#7+7Ka{m`NX#>EC zxYKeI@afPCnSh?uj00%mS9&!-+t3ssAPW3rUilwxf#+&j@_-yf=*&NYZope?CBUb& z3Dy+h#rppZfKr?T0909z6M!scL2moNbM%e>7SQ|zpuGqH#n69Y+A~Ed^%>jJn)pZQ4Rot_r?IW^!%1PpfayG zfMftEJ3ydi40wkIK>iW_8V_>>g!>) zY(&48nwQ89d#mg95-Pxuo6n0GKQq}ADiUk+-`Z-Gfb5kcb}N!LiRv#e-ZS%HP>E9t zq}4&cND?VVRZEMe*;+Myq2{tbtH4RoW541*8WC#V7|XK`^SLVCEz<`5$plTJTEDgtYH>3o{)eFkAi}sM@kkP z@F5lNQIVROdR8;gSI^UPKI?_o1ql{bFgzqhZ**p6X7~2Qa91|;tdU4J<5M0&da$HT z=pkaA0FJoCLpTj4)(@IM6e zaF^qH71*bAGtS{;$Azy;Igl5oqUog63Vn?a~=#>urn?{d=xxgf5`qXS4E~W z6a`eEDx}h<2em(NkO+$IuviVJ3Z=se#yg4q&@vTdA`6G`2akZIU(b$@WBo8WW3m)X zTTEqk4yhi#C~$jx3JX6zUmP03tGyI@UG>+$+_K6-On5}=A!8uK7h7d{!N5nzaVv`} zWsQsZZA#8nfF?BXGQE>m*@%UIUB9#x_Y42p@}Uy}sRZ?92mDvOxCk}C_DT$xs!nFA zCutTqKCN_zwKDHDImzVo1n;-p+&pP&-`2Csd%h!8ZV<@9 z$3-`K74wD9LI{}9xHxRoJN!2#*N1KGh} zvBq*RBfGHPN3$}nm;F5ae+p9Q)OmsE9@+9I7~~FB000~W_#sCN2I5gwmFQh-anqnp z6l8xzkN z90t)#P!-*ow^lJSEOSzthV@1PZ^D7~D@-Gv=F64syFTlh>JRjV1qVYN{z+_1;gIvD zQ~w@W7xNjLQz~VKrQ_=sjDY6@?&1!0oO~)BPb!T;C_Ls!+(9$}dJCs4C{M*D0qz@{ z9}oNMvm^ZbkFx1(lwO{mpGZbEf4m+It+ob6B;xx}l(Ew$iET@RLqd)OT<;LXdBOvZ zw0em-zW=R$7gv8CIEs`TkMcgq_3%`CEQwSg^%W7{g&8snd52+sbDXp4`nb>d@J%>S zQ_uDryIPI7%vc&vqoJdk#)0??rd{b+$6hvHxb;vG3xzNe<3~V8&$O%UC z53Z&rtoQXk<9fGGh{I+F*|8}#wdGhU-{<4Wq(4p-+xizkJ&@=Q)|aFB&3LhN)F%4v z>7@^6(U_34Xp*OQ;15t6TRu*u7x|YS(@XP#B|Z9RB~Fr7FC`QBo1GpZ4(s!_r{X*= zdu3CC^id6oV>Y_`#2X(-YO}hWZ%a%<}h5YJ8g;(RnWAgUHp-@}nY49EK%jMRW6xIRio4xeBoo4PA z))n%##9q%!s`Y?B83tXR7NAg}dz{2HeZILK@Lw<_IMWBXr7e&nGT zi|^I*yQVFuKYxo0HL9Pd7Y#KrMNs;McTog!xX`$ff8S<#xbBX))9H7Rng2{%{cf54 z_j0xM;&7w;q+henai>rB4l4`+2S;$6_^YKam`%7;VF!gB1z6|+qy~u}IKE_0-Z0=0 zWrfXdh;AA?awz1raKrjn2UjYs>ZmN3Uf5Kzlfyj}((de(qbZ63-U|f5J`5Y-vzj0S zc@V`l+YUvQ82?B?n&9X3+TY2y?{O;TibbD4_*IO!Gd{zgAHY2HMlj1L_&qlj%chPM zcn-$WS$>|J3diX7xes?gr}?^@ z1#hNPl*BkplVBaA!%0m4!K^{dapRO;ZJevZ{j5~2*ov0_0dI6-LQF3wM2zKcn-!gY z_ftf9-D)#|xM4w1K7y-rAcmj(>nrooLw)zns^f6E-?Iq2*`b0lMju9z~Ggfe0Xam=?>6b&hY{9SJbnYBv7`Dy;hb)Z z*rB=AsdPxbQ%SVB)8papR?kMh^7B=R(Ala7B6xMPJm8I!eZcnP#-erAhun@&NoCUy zvwN$qxcQ!ML8|kWnh3}1l~{2c)P$A0fs-JTzg=EJj|HUJj6N1uUl;w}6mtb}h`PJH z8dY0sCT5NfET@tfTX616c}B!7@3Ybv4`&&z^eVAanN&i78cMm*^|hfP-%#ev7F>M8 zaLBvF534ODvgWAP2z78#N8nze?wbIq=QyIGqTV+JKi!sMJaQ?g^2~FK2a#bOTyeqM 
zFi2pc)H~MuJYI?QERLd{Y8^3bgpS|WJ++2&-*Wg4*M?%$>RqkJk_9>=$6h`xx?UK5 zKAvx5{l%cGv6$IvM(^O|^(pH=INxqVk5IsVg&m*ME`uXaZFng^aDgNlc1Wl7d*~!& zQ>{@$_37oBnFST%Mx4*bDHRdlWEO>^-{Fe9HyE|jaw28^Q-fpP>-E)9>7u(>=#3jG zvgyNXDJ&R|k4qgaf3?InG%d$;6^qychg5`Ko*D}OXmkXz z%*}c`HjVl2#MLik)!Hwc9L|-YGNp)6G~}Fd)(B&e&?v`d!Mq!hvOQyOPld!X__=b$ z5SC#=Ya#Omn{hOnPU!QDgF@jwUHbuub)>-r$rJ7tUpx$Y)2YKr*tgN0? z;5fP!9C9sfJ+yu1eErEa_6v*j=uj{Nx;Am|nB&~xUj{5#Lz5-zeata@K&r{)Syn*s~Q1ty8jnop5vL`)SIvw{j zitvycBh$5EL;Iw>WIQf@zFWLnC+w#~2j(7XKH zEFy95E>WzgFHdsodydZkI*hY#&_>hw5Ls*RqoUg@!ouNSxKEQ;I?iY^&qBf#r#E}{ ztj5weiPg)3?-()B_+TyGU-e9~EJJq&62kN+V|Uo*X zCL*vS;H)U$bIhd-WIOv<#q4XeL3?CIBwLPf{lXZ)lqh+hvR~Vh!#niWaO5RVLnL@V2vwyV+`gNxkP)VPpZOCl$oe{HlJFM`< zU=>lS{jp%qiIg+h|ECZj94OV`s387*yzGeaC8E{|C&hw{4=;6v27`u>9N0yawGopz z`TQVIVp=d{g>&A=m#x-^vqdAiFGR{yA5P4cCTLz_#p<+kwg-gg({tpHmq_P+bauzf z4Jt9zm}L85QBIY8U$Yo)(s(MO5+o^J>}H!;2v`isusRJoC)J9u;_ogG)Ef-nb3P;M zlzuiFu)b8%p{mh*)m8taPk+c$PFbLsmP+>1cxIsRfWF=T z>B7_T35tnxLc^Zn^sSCnTpxpA9`0*0H|7^MitqCp(9KPXnG)B4rAm_uEA``0sK(!% zskpa<1k_5)y(4bTo_E4G$6Q#rLJyKn;?qPt#nH%ytso8|-^Vw3b7zt?AQ<$;%Ikiz z)*8WTLAm_xD0+Wh1UjLEXdX*SDtAdpS8#2k@aactaXzY)F{8#nOIz~aEJkSFzCIDQ zc#cq5I7Ylqr3cevwUtULs}og$d`7B1+wWl#OtZD_$)H1aW4N+)bnqnalCUSU#^0!~ zly@Ve6>-F)o!e}jDHO^TsQATBfQv!S>}o@ojd3A+Cn;ZVms7-IIH6~C9BXL;^S3(Q zlSzEKExDNCk`EpDgv2 zu;^Qjt(2*^`Fs;>@mo}JU#qbkx89CW{3YZ|0hh=12IKxj_c=1xIt;-hii4mOneLwdMnESuHNp4B%y;7!i$7BJlQ+-icl7 zQsg)r8?&A)w4;eyliwUXA(W3^GbZxpbG0&|fO7MUO0kyoy{!&^eGrF%1~=kp1hO}A zDJD!mm+3c=SMW{FF}ar9HgI9M8@{!mV|6peSt+@ab!%jw{Bh&zJgQ&e%J(WNpX5N0 z!7d_lZ*~WQvv`tH-|=$m><6C^cn1^?Wg^&xPi=kDKV3Jed=~hcc#H26U3#UxE}zUi zEjZ&{z-k8S4i)}!#2sOA6-ySc)7tBq!jzC!s7M~=fi%B7$#ptyNz7!_{>`e@{L1dJ zUXdx0)boHpsu~cxOXD8v$&j*c#mM^pd{4481Z+NXFZ!0I4V&jAJUKvx>6n!ZeX|{D z5+!1JkgJu@hSeVA+WI|Q5~K<^GX4In65*e+poIXG3M2CsM#>qESLo{t*`T6~`$ zTx(!;6FDw7ziU?`svDdf3dgd?KX;xxW+Jw;234JPzTU@)C{8_l=Y5v=QF>1B@;RWr zb$dG##hFsIdR!mU;KCR* zDa0}tyQ{d`T?p)Fu;BQ~tvWj=rAbAQ$aI5cc?@POyc!kFV*h0E9a;X-_!~(;niUMq z!zE8a^$yK(4)Xh4Q|cTsk$lufAp|u+mYnvo1|lMR#Px_;v^4ne4b%Im>`>5P`b5^Q zWlI0G0ephqM3XDgXM^S$x9bR=+}G&r@+nNaN+m2TXiRQEIrV3&caOC-3j(a)1+z?gUe z^Rp3<^Bsd<^(VDS|6PrXY+%z!wI4d0p}<~qISrcufN7Y>D^iq4@9ro+n8*GS&mHm! zY}Db8Qr!B(#W=qF>yAsxgJu3ji>Q@f@LcJl95C-IQY24bATOC9Eo6)EpZG$S#N!q}2=oeN5w{5h_{Y|dKEo|$%dXBhx&u22q zL9!4Tfz~$5$*`-ULEk2q63aDZ_Dhlj6Yj1qLO{1*|&jl zyV7%w;(OA!+MEhrU;0z3qI=bm%NG_(I{aULK32V%QS>v*}YF<=v|+!C&|*Ue;oaU+uodp zN?-l?RtuWK`=VD?3ir1#7Dm5+JTp$gFQ4(lBe%Y7`6mtORblMxa{aZQZ~WlSz0h|| z@ZJcC^eirJPFl0MdOW#mw&|%1kW@1OD!wdnkV<;^V(zH^b_i9LD3ftfv77*6h;dEP?>MwF-y?zqV1D4*SF*Sns5&) zzZi&=KI|b}_q!Ic{n9L5h+u!lFKkm3b(t|yogh~_Do{5 z^WK8yP3F<;oW`A;fAkCL2l*W*I+^!>mEov-KqovE-^(j3p1$_%b*E6s5o-=q%vh26 zJ5R=mvF1$fveoOr7IAk_L_#m|SUOW6?|whgFjF*+JyW;*Ir`?^yf2TVVIgkMa$Z8F zv}F4`#%n(QTg65S6iE|($$U)?NkSo@ck`HidW*L=XOvdfQHKoS2Zw}0Ur-bZzi1q$ z%%qrH%#{s?L9d985#!6uRblQ&geK5_X17YLw2Aw={cCpp{iT8*#8f+0Wv4>W-ja(l zJ*^yPJ={rtuK^6VrDzh)cE1NaYOFLgupTwF>vl2ZVorOx&%bf(t#rnGI<3-Sxb@xa zd7bNblWj_sJ@+$V#F7wh2{X&3I#+m$)tO#4=~r!q7Y{@Xy=d7Qn*Hmt{-@?H0p@<+ z+jP;rd8HX|of>n*^8@PFOx7HtC37KXMn~zrnYO^J^6TL)SS?P7rzO-O!aql=mBvg? 
zihoXc&m8=8uT%ErZs!xu7=h(PqMD7$bKd3o_dPf|VW}oQu-MNx(zr|f0V5Bu0VxZ5h^7j6 zx-qwqa?0)y+5WhrM-)I9{++vM*4j;IOwFEMpQw~XO+5JJB12%F&AgGRiN+@29ET;r zCHRHyY0P&1HRtt(v`b4+a@$MmT5UHHK6!v;aM#FBm@=z>rraJW@Nal`hj#^Ulvt8@ zcifU44&{dB+3Sm5#OYsj@SPY;u!{5c+$ozD%rNnAWeFFp151Rv=hq(D3q`$%o96uX zm=kwiVaP#@b;RY53HFSf*MS)WIGxq#>CZSr>lXL1Pzx>#^0Squ51fiWtdwv21rfw5 zE^dzIvz;aOFXKshP8DEKmWX6%wdih!05+VCuc^hn>QGugzULU##QmbX43#qeb>zQ7 zC1JUc`W{jYNrG-RfCL*>br$GC{`LFQx9&S~>ER@kN)c-1*0F52yHorL+{s^jH4`$3 zvgZ$8PY-)KZ+{kpgX10aIh6<{u>I!at$iwtQ-?sey!^6Bh4A$U-Yd5`VqLq1@K(BCQ+I3gu- ziZoCf{3EamHX#+CE)t+9$x~|JiHP0dU_%yO>Z(Y9oH6+Gz$Hk%{1?*itSqVcL4t}r zGh!xvR#aT1=@P{Z&D_hah<7RP2(FeJDoV7*`Axsb(I_xc3hl~7XCxE5q&} zu7Q0y+n>C2-}E2Ft|+s?XTw2G9;Y(hTaX6yYb|j#N=eB?=~sRB04Wg^v? z;$-5XOvoWYDfMJQlT9EB4DZM<<|UG^feyQ(vK>XhqKb(tV&!DI@XUSIkm5_ZwM^V0MF3I~;ZyVk{k;_2nO7SoT*xQZO zA^t$D?I|W#CN6X8)Ii=TmY7^om}%_ko!ZFoZ^y_nOOC)v3NNsLz^!9yFu;{q#->o- zt1`1Xi;Jc=zub=JZN|?2MPE!Kx_f9aK}_X(7mTFd_7x+3uptDi(nhxzaC>Qw3=}*4lexoZbV<2Khy= zq-@wXOobWq$8*15n|zbG6|xj>Y_iqjV&hiR-atI$Er(I+5Fz?|KyQ{v1&f@`suJZ! zZ)l;9IK(pq*#mh&$2!n%_K#G!o!^Vy(NY~cr@f7cZp$g}u@fkyoU9|@A67d+HS&Dt zsQ_R86BEI490pwN?E5ous8wmXtU(o68f+L5t0%Cqr61Cq4$-PGEX)*YCzvQ#%91Jd#yA& zXj?fja2n#2R57!MU2Oo({jg@wCqm(Wb@yZZo+QK-u^gT=bx4p{IQ+QyTjP%>i*0W= zJ`ik2ePBOqby)q8P1D-}yB5p3(yxOS{N-3GKcRb?ZwsXJQatBATnL7qUIN*+}$6UPbb_&(&`5@-}JpsBfZNQ7F7wDu)L-8ukS zqKr>qYo`4&ShROle3G8Kz&VaWA|Xl~CEC+2E4|NhSa{vs3jtak+X@G{uP>k-ynSMgYX&GIR?WK}5Knqz;7 za^ZSEt4gAKoCVb?C6v_lqS`%w0Ok!!!k(vZ%Zzx4NY7YH*m|}KzqN~b^D_We&qW8v z?v4wAHgwK6xbE@ie_aF^b!oHr7qIYa19D`OBr#fBGCVv?2_CmW%qHP>RP=&?!8 zb^ydTaX*?ybHU6-JOjKOCycTKByuEs{-mldvNEi;m^#y{%{=L$)PdXNUgV zV&XWZ-s7iA0=oO-Be)=P91?*y2nFDaB^3Xj*#Fw2h>ZU*shkLqcay}-PQ#(s-lys% zF0TCEmr9#8IxuRf7_q`v&2-$C_UL*OILD);0xy*c=f?e`Dv@tNWd8&SL>lPk)8+yd zv>w|XEQc}#P_Ey{DWyz}$;~vBUaU0r#P1eILSX0B8o-&=90c`<0n9h5q_S`uVDjnw za<|^`;ci>pO@A(LE8+mB6a}JB(&lx6fs19&YG5TmCi*o*$m4CRv*j+$PXf(9DSV3Y zgP4dEIY*N(0+XUC9mdJBYxaU+u`&f?St8MsE4H#Wub9$|e3Bx!Li=M5t4d{Jv@ETz zuXk6q&YQHCx;(2N;c5v%?CO38Pd|UuQAl6akLLX<$#(vzJcA(PCL&2O90g&}MTB5z zzpOn@2D70b_CSa$ZK zfA&0@B);t~fP6;K3{hb2uPFA8{$PbOE+P5O6X#}vyPB#;IBv^480s9NkQsFtf!n5- z>*{jZ`#6B5qJY^n^)FE0AceqA5nNPm#}|HG5&PzYign$1)bMjW z<*J8?4{%Y2zY=KZNGvWbwi&&)MO=5j0$1fxSQYY%AbpctO{vT7!B}TIrH;bxt{JxK zq>y7h^_S>GGZ4+4 zYDgr(Z7WI}5Q~3;O*WxElXbub6EzAaB`|9xjHQ>>DSHFKnDpnX43UmI4UCAitF?O& zQ;-?`l52VIXO~R#m9OLdUJP)w zm!XV?38{`0NzRO%DtD^HVP~pY$BE>*vJB?Jl(eZ7d&^PHKD78jQUMwsKkNjeXF#)2 zT!*{Nne{R?;KJNYk~}F7!XdqhQMK$nc{;d7=_0z@6|#I#A)+*Kk9uW>BzL}}jn4kS z_jFWIQ|r=)p-{3Ov)4U4K{xRP`k{Syw$3KXEl-vBy>GL}*J@_q_RT5Nop7i+HDBHX zTCAX;>n>GW7e*BTFYHO)j!CYFhi6~h&5;nOykBBVWpxgc1D7E|N-rL+IzHS?aYmDh zcsXe)nD3p9t4h+^`fU#w=385{sHOqejFD~BXXAXO;@o|QXwEgEjWytw3+zJ8YkiJ_ zOQ&?2FQiJUqe?{h!aS8fC=buRXA&4Py|(|v_2?hrz&gwkhgGc;JFtDj7+kD=@PnFQ zaE=Kb4nD(}qfp5Mk?ec$uTPi*=rs?`ZVvXl!sjmO0Nm&xbkSZWzUGqA1yiY;AZ2cW zKRz-rs)KpcTl-DCwW1lt{cEzzj`UYTyCfO_VFnA4{GKEd;IO@F0eIV}8*l+V-Ddr_G#59Y6g=Mt{YY3-kv2#H zq?0gkyiy<27in<5bK{y^Wx;Lq)>~o@fb_7iHuvquWV~*b6GH5kZ#$&6k(3T#0V9<6 zlzXGwlY&}WrI+LoCAz~y`}Jv3&?U8CKQ>v>5s{BtKYLK^?t z`K73ScFqVbPIT2oHdXiHrv9x1yRIbCGdmKa zO`Zm+SNL0U&iP9rO3_KG)pc@KD^w0o20q%lODVRBlmH`oSzy@n;C?989%tm?kSYy& zvqUcF9sJtt^IO`zhZEh+#>{>$Siz6={KYq6+{LPB>=%a~w=9{S*gJ zCxC%;u(gIyryA{Z<3027@zd7h((M6_ix=T&avT=R9WtYM5qalKIe>v`0oB|{=msm4 zUnyqrDl&kX2wFVyKg0%-fpfeUd?8F&a!awd4KR6rmp38m?f0`<8T9grOrpS|_vxTt| zP(*2($OYG7^~`jrTvs16BCh@7to)G! 
z3;iHBR9CkrPbrg?@HM!Pd1PkDzf7Fu{^|2Z7W)ByeQ%A?FqfIS*2wI`bQb{L!i0~_ zdje@KRl8%~9PxnXDzbibbvZg!6Q5YU9KfD|E`9lWr5Xm>&Hfs)ma(pxt=R5r3$`TO zNNve3r$;R+$p4vG@zd8i{E+V)mKbb*FY4`|;(dM5%Jg^$IOM+cN|?Rk{r{!o@yK_- z%13(5U7+XN$(e@3#}3QX9ZMcUt@S1)mXU5YliE$ed<*PSi65Nri~blxyhggY`NaBj z1#xWGQsqI|&kJ4g7*kqCL~L7>*=^gak1Lz2_PatAfp8I18h`?zq5Mwqpgi$Jb;-DI zeW?l<^Yf-M=Y*qvv)Dt_wnRIHr@H|nniM~k zIBk{+AN#fV;mYby0}y+M=@hb)Zr%fx8wuNli}3~C7l2Yx6&LMHZt)Kw0F-j-wr z*lylAm=5U6*2@$WEMT~`Kpy~6`oJEgX0nj_eo3jhX98gaexuFTlU3XZ|K<|mK0-zN z#GTsWW2kY|k^4*lC0iN05VDv{Hq9umPO}foE&0N2)Ew2cF1QS$JLcK_Xwd)ebtMxi zcl4UIT^_JATfcv=203gMW6K4@vQ~nqCt#dbW-T*&u|gV6BLAo(EF4^Js4nrXJ@ASH zpZqqvxfrvzP{Aaz{kQ$~fePZUCkKYnmHU1A3Fxsj%q?yX;zs^D@23CoeGPM^4$tM_ zfp7j7y}wFt+IMn4>kX7X8BqUH=Nv&(s;=xZ|Nb8lK)SaFj2oUaWvxwg&A4q1BxxxR zVk1^46SSOSCmKCY9F;TErnHd3O*ZKLrWyGASekN%Z8q%Y!%&3cDzOiGH4f7hpz4ms zfPeL$i59DO*cW3P2tkKWSaB4qUA6nB-L%PiF!1*x;6T4@(I|*n20ZUi+Ly8mB3geg z%=?G?#SiJuoHrS?JHp3x)9?2kc*Ztpy6aDr6;oO2?J7c0HIpIy%cjIIsU89V)#{I< zL<2WD&claj?O=!>0Pgjq@eB;^bo^MZ3Ognp~IRrSthNo`k6T0ySZ!w>7`~KlD zI1oi#L)cCh->^G#;f`s~*z=a@*5QX}=#==s*l1Veq8?(OCRf`lI`*M=fP=FJ0=s%P zQxc$XT)W=xo_KdkBvjM52n5&@jxJP6<75`)qe~?_u~sO^R9w9XR_T zD+NcgrJj`rMQob9XE8o;MBCH0Bhiu`r@_yVeb~j9{Sq}-1LF$lJ)jQ27+eTc7pX%e zt0}-~EOUiLQBspDNBRrD4g;E>FLZogdna-K8l^>aZ~=sa&dF!{96TBb45r{0bA!x2Xrz;PtzeM@kw!N;~*k*FiiqKV(|8XqBcW! zOqpAsbL^U-{?a?wo*TG8JC4<{shHWaXklWpA7A*M9ODdt5JrR7x1ISMO;*9gX8(m` z6#B(;CFf|me_W8yl5I2Vhy8NWA zwx=Udmk!Xw_bwhvfkT#Y1MA1`L75?#=XH2i1;9owk4j22!`xOai&#rf~ z9={m0gwq9H{l#tLt;?W5_F&6tk!3$R<1%;O8<{t8?`uW$)vTYXV5?7_<`Rq4SJLKr zyWI+Vwql(k1NVe;ju51)wWpIzlSr^zuanZpAIDYiV8f^9`sPkqNUsnHDFox%bby>ban8yP$SJ|&83JK%p-ZOO4xBFH0T(+w38XDF zo!uHmY-2hdV!orCNeJwGZ-bwvt!UB?jxIJr-1*quNnJ&HXJ@bP`h=6NKI#EVNm@Iy z0ngs3qW+w!{wo9_lo;ovJmqrsy7Ll7Pvq1vqR8wd_?KmB6nkbrTHzCP57UZ^NgsaWC?(HJF4yPg0k_5Rxh827_`H+o$=+N)f&V6UF0C z>EE2q6Tb5E2;)0;+5J|<;cl&1QcHklXr+ zDe}`-nIUlG+tXblzR#%M-bjwm`kdI9tD4+6W)^z~pkU!`&*>)vc!*?+lKn-z1d?PX z|NqRWshitDpO$dl)l(_9 zsv!@=$P>*i5XM^{JZ-r;XPEwHIlwIU2XqxqMz9y{(qV7~juBnC{@6Pd3J6q*jWIg< z#IBR$eo!y>1A8H$&1nOVaW~3@F7KYnL^YkuB#h2kPXTgi+9NyGItZ7GhF|iR${?b? zl+Y@>Bxq`ErursIi=i11hgCy&D2RU7)nGkaC^RsWG1pzUh?784R5svSOYxt`*6WvV zskMltSrWhRRIK!o4ubV_oNtdIl_`j}>3Vq{VPx?n^YgYqnAb~s4g=ib_r4^izt+bo ziuy(Tl#!=VcRnJ2I6dOhdxj% z4V3@1wyo|*x{U4f@Hz7hl4L00 zk29Fi+xsoz2fuB-y^mBQ?$irJvUG?RcHJF0C_(eP49D(LFSkSeW6*;3NCUYrd>&$( zr#oW{`D<2GrLFQ3m{QPWiZmt6C9^~qHOU>P$NN6E5>O#_ocal^4+cVMQ5Elnc+~>= zQT!)#+viV6X1fN9)qT`lW4S?4KWspQr9-X^Q1&VL{gd{1;0=UU(cEiLGv4|>8$u4e zj~4qAJbgHp^@mcw7kBs4Tuq`p(y|F+`N;4rEho&paO_iK-EVqq-<-Cv*R;=z{8|}! 
zbpW9svMqBxvBFP!&xU@$us|!E6-AfL7Kwb_H+|O*B0<6q9m^t_=hF5uq8O-k(o7R8 zMAGvb21KQPlkE9#kr}on z4k2VFL+9O8+!>k;h8pBJdt-b);Ph|W4fwV%Zh~Kf53-F;#pF{Y^x#O+7 zH1q$niSjEiXD9%?H${6;$JltGRovLz{Be0KYQHp7u-Ow&!$+soGm|@(h{VpcZ^|)H zYs_%TqzA~@zy$7C!~D=L3#I29xm?oXh}Wbf!2jM;P5&GMLfmOMqY*czN*N8(0Ha<# z?IV>ziCq&KvQgLT{aJ^9(PodHaa92$r~8sF+h;t=lm{cgxmo`&3}|`zPNwtqAPs;5 zR*F?DF(@=bp%h#1{%g(zP*t$mrt93B#rlKepnirYMM-+9lK(Pqa3POMDgW$kC293s zIvc+eTl~n#>yogD7*O!%l0w6-<0XK z%=W&%fH_*7!;e1fcLOmG5ww3g643@g9gS}T?GL5~_j~!Mvy3%9KCQ>d{+Rg@A$l4Z z_M^RDeNsB&Tv&RvGJ7Z@p$7gkp1OC;hlM^!VQyb0aO;!2KMrZYg_ODOHEd3sG(4;d z3O;C9k5!#6T($n_A_Ucf<-cY3-tAhd6WC7<+3S7R(((Ej4j`0+F3GTD(NZVn_)eiwt^*0a$Vz(8bH1&> z)Zp~{Uj;5i(Mqs2=K`OQ`fk$oaJmSS6bGrQa$Iex(wR~dWd$6%*KR6VD;X*Nrl3YY zHkn~%=;-b|6TE4(I7MpZD>Njoje<;TH_RG8AF#};MzqjP^=@(PH)PF?niK4;#)$8$ZVBXwK>WtW`l}n-j{GU$3f5T3tn}dT@r=;Z+@lo?5#9$$i6ecXD~)q zt1yI>ICe}HBc4z$Dh*L37X9wH3sACWs4pN=GK#-5ar~m6&sq(!N*n(OeCSM7#T1uX z?d?biw>#Br@dYM7AekXdO~!hKnd{G-Yf>MEMV%`?Z(ROO9+z$V)WL?JZC|)+%190k zB>vX_&=Emu+>|lZaBHfEzAEamMoV4-lW81wU_hb&J2BOvtN@#RTgDHew6tz zZhH79lF^Q$!lbl$=^oFYXak7R{X^ZZK=&>b&Qz;3z9n9_c0Xo|q)qth+zIX;0!pE1 z+wT_|c(2)a>cdhJsKETwufkSyT!PLXj?+Ude@EPAu{v4~yf->4h34$v8Ww%4+!v(* zD{!8Ihmdg{i*i^p<#E}1O#T{tNx&~-#8^PzJA-RAMHS3{X|4RWUM(Uu6(*p4KMoO^!EZWwL- z{qH`<409EFl4^z5LF-P?>x-tDLVKmecGJz^15>!vm)mEgmSUS(QsTg<(}uD#yhObJ z@>2)xn9F7P?H)2=KP*Cz696YAg>Lzlc>J$Ci}nENe61cVABEk!-~pS{>Rf#r+8PF- zgs}$p{>>5S)(SVRyVOddzh$tXv!jYl-Mgl2t9-lsYf!=oBiYP%$VC+aaG?+l{bo0V zk>`l=3tKBcES1&w%Ku(yTz=dh&XSR0OX>9*f*1c5m+Z_vYTCvD-GAMJV_s(Zpb-EH zublVelVo10<+$_A-#gwOb%Zvz)~(g7fLCdQmKjM^lYqz5mN`zxSKV#ls{Vt+Qd;x) z!RY7o`yIp^+l9F5?IH1DZ04fU#Trw=u{B)27aoEa<%^5U8DsXKRH)x9yW1P6?^D-kfU`WVb*X9&<7~Im~^u1NSxSX**if<|VbSV0C*p`pTuG1pq zTR+(e{|%AR0ua@)u~&r^$Xl5CEa2p@?G4&ZSfjLRyhNyN2bBz*C$hi&?`XyJSNh05Q^SeMil2O-%8-)91j zmvjB9JO0TuyWH?I8Mflq^xmp&c6;wnh&f7jT0os|ui+C(2STCmC+mrMGkA>)+>elH zx!Q+F0XVhf2odvfztIWT&vB10UurPX8<@_rjK6b}$dSuJ*% z|Dxr(gzpmwm$?p3pAp@4IRg`~rb`lyITiqPTR1k;T}iRrPusLF^Zi7a&5`?7S)Dcz z*J81kv-9Q*1{+wS92SUQR0hko=a%iB2Fd?fv?~JY)@NHHC^!J`jq?8bP|jB=Gyl+h zXrjD8eh?Eq+@x0054uYU%;Om$LX@&!s1gO-0ZKtH27=)OH-=hfat>@|3{({RVR;0C zu}D2&3zdc&-@OR7<8jb0pasik$71F+yDv|HEP%B5FKa zK(pq4gtL@%g_nYqjy!%a2ruCIg2DYjpS+!w822(7F0eWEx}QN4uKL4J8j(jDP;K~q zec$r*K@Jsk7^2ebR{o<4j4ciWh9Hm`ubrqc>YtfM&_Oj#vp)fPEnJGaREbxX0xB)H z61}ArNAtI;&2A-|2&VPEqus3~Z@9si3P(KDO{A*jZa~WIRIqg<*&~xGD5O_K{ z_T^X?G+8u_5tH(xW+yT8JzOzwJZEq8^aNNs8EB0qK18y_B0kp`N&PVwp0si@$bE%`h9{IcGj3c9BjNzc1bX4TBJXc>HN z#{un=g_9RWO1A{CEnWN`05x1jRK7i&F2;T!yUgh}N1q<*VW8KU0%=mKJ+QsYx*pbL z{D)-6CR5s$uNjg1Mce=e0xAr} zc4WJ;baRuJ+^)gD z(4aOmE2nMqfBA)sl`i)EvAN`c!-FY{$CI7OfL{ePZLS5inYAH3u^`G~WG8_EFrT+E zMr}TCZ2ww!dgV)NNymAoi}?pv#)8XQKAVw=L@3OeN#fHniM@#fyB+N^52dgPwNDma z9gCKq;6*oc*%qqyW7(_RGVqx6;un8j565kI^X^l=Td zVP9qYL@12k-7fqrrjPd(2g1mZN%IlY*H0+mRW!8xP{cT0TBpit6GOMrskCw>y~CJj zw%lNT|6AO>B#0f#g0q|sR>>4@qa>iPYn6^6V%fsDjHx2O{P2t#b3_7mex_!(58C7T z^CY8`5W2fGOHLh1Adhl`aMT$gYj@5><$0&E@NrSHKt5TN&1`q@B60u})6pZ|PG|Zq z;GHx>dZo(M2NOY)E$Ut#L;8U&AhRTmWXtNZNhl$)gmu*6L7--LJ*mK#2K9XLeR|ug zrHh7t60}fOtZ>AWu3Cd*!c!#?emj*N^ldXK`7K+|t;zxW@7cHGlyCkjQ?=tk(zo|+ z{?5sw2l8i_=U>^Z_lKI~NSYSY{ zcZ+dXV53QAH_Gd$jlEPAf-vM}Z%zUyTF{z7Dvc@5zrI{7bC z1c_P0W}7j6Bc7bU3wjKP`4ApM3=Aa!*g7v{T4s;K?4k9#CLM-$FywEm1OO&;xb4HI zPgkQ$Ov7A?DNTQ+Z(Kp?<}b~Mc_y;OPTd}+2t#uQu}3%ooI!yQxqt^2bBjdK1@#~G zD#3ORLfE|osJe8c60s-TJ+~*kK{b)q!0MKa1za7BKWVo%5^1GR#h@s`{65Q%Nf;Lr zQw7E}TOqM+Vw!1mrt6)%nTxncnMAOfF6`^%)q4kp(iue?Nz(RD*A*0wK28+K^XzYi zNG$Bja0_rnD^Q&IEtrgMg-viBh1MAZQi$gnY@2*r_zCnTE3HyS+b8!j#_5bCG4yW6 z$878GbMQ3LsX1h@42eiX*#785Dj;{|hyNbnq)#6L>j10RKCn>I8wahmc+xetloKVP 
z{>RXnc>1c$sfv0#1Ty_P--W7>F9)A)v~j4DAbuAlgvYDJ`PoTQuRLYBJe?oS5~(-?Cdds8>QbZ%ne-!fXu`gs z9Zol4pNJXz$uRbV?JDLb92c0s*Fd-?-P@i4=MYMkBY}`P!M%}I!%`fa5ylJ)(~(QP|-`6t~ZzO z&Sn||MEG5I(9DP9gRoeP#1^Y9%!3drzUw{4&Ie5wP62WlRtG6;GgWeG3iGDk+L0sL zZ;FlF<#8ZU{49KKYa3!hV()y=(9;Trv6M$sxkNvIec@8ytI{vgr>CjbXotUG4e_}p z&0&u*1Hrc}ovgKmLE~JdxQ9)(E6JswJM$-jX9DnXey33g7X4+~wuGvfA zRv_q&h5*ZpU@4AuuE@kph&bVO-)D`Gj!7r5jbV}ieK~9e0kjU~UcqIwn3Mg^S9A5h z9`+bAIzO}4O_+;meF3G(FZdn!?nKicwaNqi2|xAz+VdQq6;Dv%wwKz-XjmM zGQEMlJS8cg@VlD@RWX^5sX`1tjEwxEf06Q2G50gaAZJr*q-8ygv)f6e#fs?u_G*9( zKFc8gmQjz``y#belg6eV;W~UoYn|>#D~lu+;**^9WP1f zn#!u{mlV1ZmQ?^BLJ6<$lTx zn6-NVF6Q|1j#_XyzS_z)d=ABYxVs2bH_#L4w=h?h*)2 z(BKYV=iS|N_B&^P)>O+>cXibx_uhvK#vE`hv%`j-q|F<1XAaIdq^vK5!a_kwbU|ll zRAJ}5n1o5CZqNK~qvRS1qzf>H%n7C<7ROg`kS;mPCko<6+6wHH|H$XJP-Dw|%fYPsB0$+&;r!5Reg3&H)+ z;p-U)4+o$7j$0swg&Q%Hnd*h6htIbce#P^Gei`;|06eG#cp5Bz=>fIjPj0aHrPq=a zjPybiI?OQ&UCl2#Kb?&qo_5hNPNzbnJ{+kJ;~b^t|Al|gL<(*sOY~pEm)*M$ekt&| zKALTOfn!GD)e@vl9<#d)edhDsQ*|(|p;SAV&Ct4Vt^ZNNY|~E#%mn!NQTiZ(l-L3~ z{?Q#2JH;4@m{7}Jt0W@-qCxh;kJo~3ZG3q_-!5Aw0B|OEdv6^vTA%G8nZs%cd*I7c z=*o{HjbHwd-rk(mIN2{C;iQzFck({dlYFy*%!CH3*l1216Tfs0L{FI{#F;cG2|rE_ zFko33x=<5knAFYqf2QVnYbokC<7NV1y%A8vHspBPj5gIvSqRprW zQjwoWex`Ysc`aELi4eP!*uidGPTAv0zE2+vv^5>oW2&Es|K_}Ui$-)63b#_^>TOki zIi_Ll(CgbjG~DQYgab+>VkeG`bdkZ9dYE8K2L(NT2D!xh-@?#)ktLr%+_gA}Jqfq6 zYOO_>xU03>!cd_B5>D?Q6Q5gd$;yJed}<}D$yEOB1s5yfHq5vA;AKR_2d%D7jz3+m zE>_d|$nt2$*ljg@JCW9g&?8tRQ%|(U{u7{(4F-lZ&-WwR8-~u3lp;c{a75?2p4?;! z)NqU-vpWmFr)$SITR2JI0{s``>3K0o8}YS~J`Gh*7b3)GbJ3#pQ|!wF!-`Bi?ueeG89Dmwet~+2`{f?FgGZ zF;RWXZ}S%7{tlZvPb}`{)Rc|eWvQ0n)E^=WPzvz*iN5q*b|VP8((OVRpq~L(Z#y~p zok@jt|Mu$*n<&Ra2zw_QB=`N%=VhHHCzj-GOlcQSLU;bgDQ`s{aoo&ETDW`D??zSe z%%__mSU(0*@b;``&LoOF=t6MsEi|5}26Rh|MM0vN$5z^_mA8F;DY)y6nDoKF@jg;i z-+93H5M?a%LAzMF_-}{9v1vf@q9qA!TK-$8EY9{q)nn}O5#)StyrC?hbw=Lm$&|jj z!lbyN#Y(+)sZItj6aX9SVNY{q$t2#HO^Q2)nGmxV*P3zQX9xX zW}A&ShaJ8uV2Y?}7naX_ZjDt9qLv7JkVM-{!o^}j+Z1@=!rnh{O%_F=Veiz+<|AR& zuMVJgp85JAC0$HG+tAlR57``SV)_$d*n#YfxSgGo2YcfxRw+GUO%^>fnPu+p zObVM$9tuuit?GO74SNFbekXOG$JZJ+eK`czYt~t*Vdt|PoEI4bf-|hDJg}Q_M^R^v zmRg(4=`f3LZ-BLDST8N#a$vuwtw}I*=qmV+h5*+FeQAU`J}$nr{Y-H|&Jb~iCgP!~IfSz+T--M>4C$ z;Y*8OInPF+K1an8!%ZOR;YX|I7uvJwYvp?EmboX8rVDCXeMI58%1pw_?evNr$zYZW zH2P7(K4d`Yh?_PUD{k8Ao~!)BWjr5Sdni{PLQJ!6{F;dgaSu8zH zkgM4Yh5>FJ1D0JEhfW!OOd{VFATNI#^+OWp+{`7`W-6Wt%YHZRk3q`@?EO?i93fkD zzH>_Cz3U39ir;=CiA91C{KjQT41X3wbJ^qknrw+J#5~Rq!8MGhCz;2_q2Q5(NPLN> zRE(YGB#ULrjT^dNK{QpQ@FDd4!-b4iH`yW=?6!5GdVzFt5qDyU3J2Ik2V*;fi)uD* z7jmv+u#*{W+Tgi9K%FD#r^DmtGVdI) zIz{|z0qK&FD3+`j8Uio)2?Edw)Z4Q}D^=(L>$&zE6_Zh;@7rIU&+iV(iMXB)wg!%- z{FL4eVt%aTwh}MxCua=%c{-wZ|@?bUCb z@ni!u-D+T=h6qrukn`eEfrB16Xv?>5K|>zKqn(Gi=c(I{`&iY_3RqMx3XY3oc%BGB zo#v0?1*^@~F+ZdwyP1ZfbC#@!5~Kfd%TG>r z1iyq-It4-lLc8dfz9{SkmfaB|8Fb^s1TN#Ja!gz@?6b@M2Ovg>e4b|>^A@3|I5$^Y zO8oDND2PEh^G$sRp4GmsB;3sNmiH7Oh-c3NVKD2uD7_#EDn%uT)7(@2V{vtk5f~^X zzRyLMXiNRpRS)lTkxku#?Q_L*P{ob zmgm36F8dC`3Emhtng|BNR`0!r$fB0GWvT-Xm`)2-I*Ty{YCUs$ahg`g0U~+;7=@OX zcmY*BILMkT03Sa`4^p@4bAN2}aCM3&XQ4HC zf&#%~`pT8xncPw2)kV7*q>nEx>io0uof<0jHo4E;86d*S_v4lCK4@XDh z?aRa}dn_~SaZ3K^1^>!EZ$Mg~F3%0&H$p4_fYrghGkl8A-`}kVaRuEG`T6NAX=sox zREYaLaXch~m(Y55l!$1A`)BG5naFXwIpG5HOtm=JNhKDS>DwFZt{cqvY=>FQNRZjm z3`ML~6dy7>wh1kjV+wqm`ap)tS*oOi<}@mIFC!5eEBM_l1|W+{8{9a5_tOiTkB@(fGUwd= zn|yB4;qefgMs%DiOmHl62#H5~VIy>~AGSSrU4Qy+BO;f%?!2DUF@$&NNHb=m5q{$t zh|~U=&QiRuSk1IS4K}NiEdiC0m<;B~GJc4%d1>`T8ZfXj#4ftjQoazpt4JQaxXgw9 zW?X00J~X51a!z}G))h#t@l2CM7umU0kAlZoz-2BWqb^Cg-D=UJ-J(dsF3BT{1tuX! 
zw`<`O3m-iZXBW=X;^xPpIL?Kq*~;O>+gh6^h+yR^0gI#q2KHfZCebkD12-6AXZo37 zcyq;oyuXm&kj65AlDI1tx`ar~G!xp(zIdg-0Y+qO_sMZyLO>ZJWceB`O|B_3Pv7R*=RRl z@_CZkk{ksh#1|S-SD0r7Vbfuzaof5$j25%{Zb~E3^+BS)MfFc_?SuB<5*9HKhDRgb z=nX`{zLZ#3RSmtTPb0Mnb$^sO9)T^K|n;*p4UZ^g3qVrPF1lZPq72Fpuml+-LKommsb z5v?gK!9LPv0AN%XS`X0Gsav7FGyCu@@~K*eEBDp9mP7hePEyi z-d4w+r`-Gn;hIrZEf|AKi$uivD_FQ!ID3EF7px%^EExQ!toko9PHyL7+U)|M4@$2e z0-pDnc$`6r637A`rzE5Apb;o?6QmP}1T-`o={sdAO%N*?>H5^U05PY~@qRimIVmY7 zn4n3%Y4BPyc=OX>9bBa_n^coZfv^>#hin9fnEy*0CvD>PHQ8%yE0#Z`Soss|Wg2|? zF9y0j0F!f?N8xz2l$^Qt0SMkfEplLpiKJf}{o5&^4>@N^>X7rNh8OGQnzCCrD^a=9FCiO|f|Uf1J7PP}j5PwJ+EuI~r}Bip>>nRS-qw11d5 zNnHNO${0%L#*FLD988iVaiyOj-;u_LS}||yzvTUb_o6ta?Y(Xh`;(Ont%uaD5jNB1 zD%8E5Mk$zkdfq@g%5pWpKera8VRuhsR6@sWx*_#JWVlJ8aqfluMlBfu!dU|N^(Rp}b z2)$xy*k*c!`CtZ%l3i1+#K0RxSbFiQ;GWs8Jm5LdB-M2f{O*>}%^ifc#eIwuLlDe~ z^T{3D{#=Fr$(!=%o$)yZm-RQ#I!#yD^A1+hHN;y6?!R-s8qUP93;aeXt{k=7)erA=EToW07^ht)Q8G<$p7ip;%oj@<;f6-g!GW zcD+~mAae9Q4P&60(RG)??!{@T=g2y$i zUtTSom|{?7!A?N{oW4_*`Y+>8_O#voF9I&fK|BsFG)ZJ4B1lpn|Bwo`S8qNtjiT9C z^^imm?6&QBD?&tH;H6(E6ccNv>%)e1`LDP?i8cW-HNkS;>p zS}(eH8+VZ_(V+Sn+G6E+2ZNNBN6s1{6XN!|RG8K=pE+vcGd}*Ww;6kkJaP&w#RcTOcS6LBU0)4pMT)m%EXCRw-LT^Zt+a0`Q!} zQ&Vh!gaGMshk0Rkam<)GEIS=nuZL_&3dNJ$cf*LGC%IalLmuD^%~}$pVPJw61U`Hk zF^(I0`ozt9WyG#vO}f%xUf%xiTohiJngQV40 zXAO4HzuPhJV1$kS?gg3_L+I5OKr?DA1{G)`PoY;@0ZsU^-&_AzFQg3UO?abbN5Ga+ zj+eLe{GNda9Gx%z129R%nFp#NpN1e5K@ST}xLFUhHr7JGETngf(gkd#s7;&q_CS-_ zDk|gOlmOfMVgOA++o*;A%51*(`@edjb3iZjYe_}or=ht>b@pK3Qg!g}k`93ZVjjnl`3_v!y8v%=gIx_! zjWEs00jX;*3RRG&X=P)YdV9}rH?>p;ymvHz`88MbnUP0mwWS)eog-voeB<`78$5Y3|P3^WmJ7%RINWDm_nrEL8;>QylgH1)<` z)eCe*$=Qwq4sK_o#V)v$+U;n1co;!z6;c^aXrbo$VM%W;(q#k-7Eda617;qiiRE90 zLzz!^r5+#1aL@}1W>Si_NEH-p3{OnJydD>R#8cYbw7e5f2aw+i6BB>_8v5}JTyM1d zsnwW?G5Y)av(GqbW*kLo%gVyfN*A(B0lkkuVlf7%BE4+!ckR7NRscm>*i0VuWjV?t*q5rq)C;JK=DC&rBul#*2F<(dZ4_4?AiK#toxX= zth~Ghd31b}$3CYl3P_&dsQge=+^#p`^Ss|@dH|CmA!n&-1e!=ICElq6VoaR@s;(Ok z4&6(6zzLO6b)t&*Yj;#1->QzO;Gcz?Kuv1?^VDWb-yF0fZF%5(Q$`gfSFk6=6O70E z8EBzk1T*9gg7XLMHhWtUpd;75LBu3+>?PgH~mf#m$13|_pix?KF3bZ-RWi8nf4)|YW8v>2CH%1Rv z=X4z(FXCV&5aWbc!N)n%Sz5&^a%w;M3N8pnwUJ-6MY##SUJg#oChCeD%+KmyiS4xA zWgUH+%*p$SekR>|v5N~|Y?7p;x}o3cb-QBvuh6Ta5E*J;;c0VNm*xxZJ&V#7J$}^d zSYRzru9y>M)vW;&(Sr)7A!(y)b- z{9nO6r3w`&Bwf^Ai*&Jw$!79%!m`h)*;S9m_0Nv(yMC097eK9TI}-uNhpTh}e;?9? zfloQJZ@_*FXfqc-sJJ^lqs=5S7cA}g8S^ft(bGDFUnkN=@(MEV0@0lTX5_H^Sl zW1;5u>)D3%!TqHI{so8PffyqZkM$5M;TVuM46PiBwXCd8x;Zh{7)(0pWn9N&&!Ut2 zb@ivA@X2*ceStRyP3_dpzr^yfpjmw>j*}y!%r1hJd+Sn5$gQD~|zi2ktie zL>gVHkOJZUkp&|v-wn1H-%bPK&YTIvXB*9-i4>387~4+E0NN=xcUA#BwjFJQa^@cq zB|ig%Ck_~f_M#GmrTJI00#5t)XT1zQ4>b;m9@32GHY0lS9<)a%5BAJFg~rxY8uGpy zNgoE3qg{aN8bVX1HhyEz8!h${Crj?=!e??Z z*|)^(TN3|xl@k_54>8=g>3a$RQEn~}Tt=7(Ry|z1t^|ZMxk`{Gfbd8R8%hS{gZbw? 
zBil9mGyNxcu^5GV4{|1T>nzUZ>Qc<$5f}N#Ern>)YzB%HZmHS!P{W>fM-}klc~TZ> znz}Q0-7l~-{OWGo+BjOMcr1VI3Tuey^piK;h1t@e# zNMyo+56HQ~t=esM&o`5gD@nvG!I zbAS=|IxM6at?v`^I!jLvvE4*!u8FCFw4OD-Zyoy6Q4Q~u9~X#<6EjQ#1Np}{Y0T)z zz`@H)gfAhTUfT~0052M+SSfAY=_0G<5h(PTdD}LVLbnRS)S4Z}TTeD@o%Z+kW9DSl z3wAo#8IA|^!(eg*6o;%CneN>@Ji@9+nRmVoiVD;%=rYo0VWR`e&rLHaMdAkL?RZDn z3JOS&?JIy=i(o(WTU`^8pk8%h&Jao1=ZSq`ckunf7>yv6YwZDRO*w{4XM<>p-{BmC zQI{pvUn`l)ed3zbHE$0OcxYCCgUE&qFO+>SgBPkW0k_>eO>77K2vi|Jo0_7@E#ht} zZrJCv3_4)DGS223j>Lgbpj#W91>WPT0uu$aP@@fS6sSvc#{b1J_^fpf|F?7Dal3>& zyYZTW4ZlB;TR0F8l@WxOcH8?F8eXE^8Q{mQISN4%WJpiAK(KsyGy3_~cZzCePi^$i zR6&(aiRxSId{aq_7x^sS{g6mf<1Qs6wo~3xteMkaj-c4lq}cZ?$d~C>?jPCnv0=qS z06|zW%jUJK>Mk~EE#wP)pjOW6B>}UE`6QBX+2pkT9?dxW^AgUqbK03Yk=q8;dwr3r zGFlicq~lF5;$@`8EEF@dCKnDMC41Ax92}aX^W9ogluqg02G$qM;LEEoA8(vul}sqp z_05-L-&)lZ#p7fhxKp#YR?(!X)}%v(VRs2Rv_4|NG|Rl)&2oy`*^urN7Dkm zcb@@yoHzJ&yb|HsX*mvc4+aVh7;qxIG{&A50lUgvUJ;p4+ciHd2e}jusX6ub(9jB` zTlUVLl+PED(5AfBJp~QVsHsw?>n|3S+3YM6*8BPIhF7;6r9jyrJ)JHo$Ak9DvM+EE z*+pga&Nmdr5+ACg(1&H@;y*=As37D%rgEAKhx4S&)$3vz1QYVNP5k?aiKuWd2+$Uo z@X7HLTYLB~F0xs;ISp#ly9H&U=DGz-mn!EY>oOLdeg@pr&wrnfBjt0FF${4RMu7O$ zp?1RUKS)mI{VGNyQV+3yNz*D-k4~t{?G}lZFVtD1#(S+}Pf;>Zjhsqja&-^vX$%~s z$p(Qxb+=d#OYew-u@%G6(|JpI`FoeOB?GCuK0b$1t_3SM>{k}zF}(m{#)t2|EMQ1Y z+4PcgcBD>-1g%Bu`JG_X+hxrRFzbG|zCtA{w)=W1Ez<``oa)tbk0o7Z8vnEJbfCV2 z6MdN~zNiqDbY?}~Ur3g-v~=9+NDy5y{%<|k57!EynoQoo_(ZBMMGoVIk|t-;I6t|G zoH*hJq*#i;yT6$5*Sx}hhqNav-spl%rN7z;Ey){ntAvJ#%JWBp|Jj0mU4f6NjuJ|G z59f{*JNm^NT-1jyhi9a;3T~-B&c|l43FY8 zbtAgc?uSy0pp+#Z$UkIdC)JMxx2lo9K67c8RAx2sbFO0$IXNerbdon@@R^n^~$u(O%_$A6^tz9LJkt5V8 zIM#nQ^t%p=N)|8hwrH!kv)@%0-wm53mzuUEgt^ZUC10!?eEB8|<2ix>c#0t6;73Ac z?)6&Bt~UT2$YH7F2*$7y3wrzhVu!&h+-59`z_LLUX195~mapj%$!2Ax3D)>}u zi;R3H+2Q}T@Mv)oe6gd-CYBTeVFAiQ^0Dw&k6M?cY11>BtiE5KQ35#5q;o9rG#srY zL)X5|mlL^PGMT-gvMt!xCnA)D`4fW8a4T3*lD?AKjJKh?Yp#5lvgUF4_G7}afEzt5 zSZ7m0%^-pJE?++IAbawyauz>>U$1iAyJ_iK2INTOId=HgJa7bE?yZnU+DBOMS-DOb zH@CVq*Wu7L`B3PoG)&?tk5O`FAk$UkZv-f26FwIvR`>f)RH6u2YA-IcC1jqw4~*WXTC-!@l1529{3S>n zDIZs{HHuaJ)0uf+1K4^C#-cm4bbc_8F(z#~;RwTM9`3{8YRU1LPo+~z>>h#sk8VbF z0i4KRP5B&e5ZRj=Py-2hAq=A$3xI_9-nSRH5)0`ZvvkW3-xQK4XRhp@hlNaSNhN|n z)^ftR`w)&NAiP2V3VL}0aaW_FMG+ZbKQVHve_}13INE){#O(HsM4}M+31_`3&j4A* zjqebf$oqwKAEauqP9I7zzpGM8L+1jf)JhfAr@&hv$*yz&$Cs8o02c*&nbGr~n`!N` z=0+eWovtk()Wm<`P69s{$xwn_a0vH85LT5}noh8tR|K)ZZ_~0Tz@wAuxeBRhqGToN zB=k_wk(>`!;MIZB+>&~@&~MqH!$!eUA@4k!SXDVlzLwrB?-;xv-`!cXz0hc}}h1+r|K;!TMtZ0qrOlOOtN3IUXu78W4kf9g8+_O!2$ zP9j&ZQ+T9ad9g#5mr(==IkJw#E!N~~XH4iuGWY7ZkegH;I+xY#FKx%1jv$I|sVh#Q+n)-HJhG%Yl`hq&x(8+#2{^ASSI9Gz zRDgt;yV|&!_GGn9M*~7v(9CF|eGkX4z830kMF7D|eH@eHbC?SN*$O>T7!R^0ZkMPz zPSsle^&J}blu*0vPv7|G4Hpjok5imi9IJXi*DiDL!vS+Vd|9Mz9G!Fr^Fv~RRULK4 z^wkK>k*z9KY;F@4pU;|>=yA+s8MUinR_&+#IhNPO8*-9EvLxv*> z%<$wHzg_1Y{n^_PuzyND?ELD$1m^|%ODjgpU$ zn_59|wEweH%xAkv?66hT!iLwF59jJg@X22pn?su!#o3juM*;_BMqSoBnAE<_w}pb(xdkS(RIV%Z?RZ^V z(;wIlBSVLdxqZ1mu^S{Kw)rrS`AB8hO=e;QV4xPhP_y3V2+KWwW-q88h*Q!2#cYX_ zxF~nDIDIz|Z~qd5GuO(({rRXOA9S+;um_gFWeRA(nnuvp_uzAwE>0^Om!>TZunc(T z%a;El(UY}-qWdt)>2`;W@e3@XHkb8*CjS|kg8vTS-Tv;xhod>^XGypL9JJ;_>wR9{tZfgQzGb&_NoSUqRz zZHy71U--uhOVkbvL&S-(@OF9+G&^*;Kd*EJZA-MQu18)QDLWnP!_$B$$iCZ^>y(F4 zUvvc7)A9l;(FG|68t7j+j6a9H@r2E)4Lw|NE{U#;kra#o6zjjp2R=Zinc46s>ALJ% zeXkD@vU&XlW41?I6e2BC?vHS?F0L=wz{ld?ueI6Mj`JkE3$s+$MhB4~V*=Ze9auwN zbvmf3HKzf0!VTS%^2JWM6n9571exe;8L!6VekDfNobU@x)-={-)%l$;5LDjby|hH4 zy@2yR5X7ZK*^ltQo1nNXw^lOa4IpCRd)>Ug5b*WA>J@a68UHe$#QI8?<)8u7hA1lN zv)#UCT-*12=>-T@?P7IgBKu%)}FEO#PM` zFX$ctGr4}zvUu;3yt8vj17|NhT9%byE@+#fq*dEU!sh56qKGgj2FmG}8)#Ek^IDdCG;{ 
z`}GCCbY@VcRS^-uA$k@2c^k8m;mTFXqfcq3G>o&Ml!86K2DjarAVQnQw@SC*IPNOwH^9Ebqy1tQS#xI~) z6_ZS5lqMOtal)y51Ti|Eyp!N{H<{8Y*D7lSljp5C44|Y8=fd=S_}lO9@YabKfkw}^ z?cv(~q;h$r|C>XpRQE>PE7f(LXqKK@wyaP?>t9ZJRTihOIjsS)+33WQ#b^YDCzqnw zvjm)EamNTl6OV2hTpOJft=+%&{>FVu&;DhHzZz=~x&iWi{{UD;ku9`5a!!rf3h#%K z_mT)yMdQT(9nQ}SPMR8UyZf1U0}>3IJXK|q@xS6FQo=yS5-cevP0Lx?{x3>I$_ zWNVwMGYhym2Cb=%T4-h>jh?b8p*S{9XW-Y>XK*v z8-Ek=AlJcdD!EZl7AD12zpoyIsmfPeyFCebk05dPUL>*;xc(B`c)}kcbx2T3^9&yd zm3V;aub)Z9#VSy_fqm_MgWer7X>9jOZA{DUo%FDk#uF9eX3Dq^02n3?ZC^u*vRSg_ zE0#`?Ja6`X9mx)tXfpmj46015&E0Hi()+cgQ$B|)+T;`jz7f<&)-8@vdv$}6x&R@V z%_t1C+Ls}jiVlqz55lat#R+2bb~hu7FK6zi6?+Qd(ii%>jdn_TyKJ$xB(EoXGmR%wJ(bO$`wLy zUqNseCqDKa3}{}4g`k(7UgIOtKXt0%cVy+&eOXqJU!Zv=DI)Nu@agmxwY>%k*glw?nuzeb5^OkE~5 z&X$`+MzwM->(cCEd&MN@Hj%pdSDUnqX(t0iK|0KW?;F>ej9ys zT$fB>YIn8Set5}!hq|gB%t`5Cc@===aeGl{nJbd^_?w1ySVI-29WLb_hXm>9=8xLq z*?O@Io8G&5tzsY~gRA%&Otl3tL#5A8Rt2PQ*E$}P+Ewz0Z?VmEpbK^=bytBC*~r*T3cC z#4I999QXpfUk1o?5qBqIeBQD2AfQuV$^p-~J`^)@ zIHV?~yg{Yvl(efuWo81TQP2HN9I*_}f5H6ughR{~ z@!0GFS*%)(%go5Yjtc`VkQW*E=I#$boUI&wm1wln^BC&22fyY)A*T?7U8H`-$*{=a z-uQ|LJ;Ti%>+po#YwBM~V6eT=4(}kp9(fV&&$!^?w~FxmE&H2!^k3&<_60p;{jnwQ zgMmCU=q1D#z=AI~AYo(DqOGNT(3&pnI_me`PrR=6k&wq@te%RiQ?*8<%bJ?9e>MP^Jm-E9ZxVQNV! zVKL;qKe0ww$g0XrXvIBZ2$BcUGW#1#RTZgMtDhBj;D$)K{(=MAh=Z(1EgE78V_H*_A^f`NoI z4=xZs#my>_){+r%m$UZJ_pS4Mx)X01dm9al8s297LDecFa-Vknc&XjySTjpKyQ>QBkgRd;nGppYDcau9B zN@!F}_pihFxAb)~=J zcNchnL9)OGm%Q9W!@Y1~#M+GA6{|WdY%}-%LyK<>U=-rkb0v^2Ys63V1yKKHyFQX2 zFw|$05qNvZOsMhhD%N4zuRPY zxhq*K6DN^~;L$KIAVZi`4=^A|7OIZKL!$ez(Bnk28Z!omGO_q+KMX7zVFfvvcXLZp~ADEtiB}W4d3Vgoj+jclU zu#s-uB$MPwHCe%4=I{FdTMOVlz!AzO)r`CW%(HZMruA;I3Qi!yO>tUx|9da+jh-GS zDUoAmCl??BI^ni&fUY0jPC9yb-%)6*79Udf6!H!E3&`H3xanvRmFW%rAaZllQlwZ+ zqzYS0p7RS3bnt---hWN}*lY9`NQCU9E=KB68Ymey*hM;MGx{tMsCjrtzNW&i}tACG2 zqZbI&7QRnH-%)`kFsbWNH*pcv;OIFrapjZ>t*XYVdE7fOW}T^I zxevahR-C>W&-VPO)*5kz3VVPSGj>C#Ftt8gT5g{}+6aDuy}o2w&*8s^!-q*3;&u=2@BEoaRcz z6j&8ju}~iyH67D4uEx0y$?lb@1gJLQ6GWyjf{fzc# zI6q5Ve(`tvvZS8Bhu+N;d(!%!H~v|7+g7fLsB1b(JOdyv?Ev?L)ocA|@#Sv`OKi*M z_#%^i9q&AfT!~i!wY++CJ$qiDD6ChAuHn8rDdVQ@8dwNJN_JFvdusuJk>RbA$Vp;GUzUMtPaN3#fmE5skyQgnjG_<1I4s zWz$VJB&;WRnPI@ITX+u#+lb2^`@!OQnsXgH>sW0NW&fvl6=kG5f<-$TaVem?aeb6w z4@!+t&ut>Dr%ls?5X6pI_SwbS{|5BNEMN|!KXM}H_nusOjXy=$uHj{gUL4a9VfD0F z3N8#<_I`1EXza1Vll5zeMzKaRYI6}88Y0^Px#G9e{_v+!1RK?3=X1N`@GCiJe`5V~ zFlqW&?rhxbD+?@MudPSiKkRwQ*QA*-sMh72N|f}vt57M`$aL}R@xpmDj zt&U#cx$k%ka#HdU?DI+o#^4@ep0GSl{^L4X0$%9z>oLs7#X*J%2| z%8Pefi89M21V#RE46iyeh-r$K8ej96K!WJJ#Yh?IQZ*E$Y! z-*E~*GNve+1F@;Bm?3l8*6|vqlJN(qgBL7hN>>M{-&29Jx6}Y9S7DRiVu6 zBYU#nWo_53?hhs_M1n18XA!}i>7;zeku7?`mbjljW5RwZltY6Rx2ZtRGr%AdCrhhv z%NF<`>XW2Pu8@G^V7HUGX= z+)&%H*7hD}z9w82wWmjUz{2dCtTiE!2&q0>aR%HI&4;c>{0v#fDEuf#WGf^70r*>F zLgPO;MGjnRB*FT|wdY&!PjhglET!D)z9hx+o=pvsS6%|>oyf7aPm#NOPYA(bYxWS! 
z%F0Wf*s&a;%h~E*%~qs5=WQ*FJT?%ZXj0VpNS^`nw`h@VmzLenGc7Q}uBVEY2z>4{ zux}whqyP}5(6EkR%vcburfM9%5>A5$f|YQ_8*A1J!Y?jGgJVR?W<;Es5s^O*&wSkC zpERtc_hpepe8&2syugik-(V$}+19*Iw>3RvPw3=+6N%pka?^l`O6K&V|PS(Kak(TGclHXKqxd_H9akTp~gJR z=!f+gNY}402DL36&#!8obL;x%U>$Q?5GQ5~?mT&D$AlsuAj;niY5s-(f@8>ix&YSh zBMD)t?x2VGU*|>a)DZkV7FY;%mv++O>dt(X%@uiD03Zg32+c6rCCvI-n3ZN^|NgWO zdHjRxjXB_7?whZA4WMDOQI%RL*iu;NETa}%0Jh0*WiRy4cpOqI@ZG}27-~6<mc8$m<)3JL-WB^WQ77FTn48H3vyx{Jm-z2!nF^OO4(J)jUl#^lH-cG}w;I zMPqKfddI5vUkmzgv18bSB48zuW~i!Ozs8O-X>EOrrSW0t8C}T4p~2y6`DO9lpqci^ z;R33*h>eG$1v>5J%7Eism7W0VJ?VcW5f8n;za|fjK`f>es3@N!V^vXd0p%l_#|?b; zN+Dp(oF`M+7kuX(Re)nj;?MZdh5315+f z4wQg$!5(e(`7;JzyfkzaQu)UgXOXaVwp;uUS%Mhf%Ikzy2hkzPg8>1rsf0;Q+GF~b z^2V{u5J!HJ^0CoYy?}9sX?^Y$VxRC&!xTuifX*@~)q?c+Lat;{&B^sM9J?-`UwolB zgtq+XDhCIB+Vo)Mtfh|xOfyIUT2k(;3(G>kMu?Pgz@Ni^MFt@$hpI?e)dSni_r91k zc=<#E=m5-5&=>qwn$g)Zjf7G)Id@XVO0JsdAc^5_2`*v6QSg8S(9J#HL}^uJhK+3}zfy3_dTknh_(GF8%>7w7L0`YF z^sy?eS{eehsd3b{)5<-V9i|C3)OG6S_q{%P%c|2#;(5*`sf#`!Ys|T?W%z^n5)@I{ z|J(tj?TdsAvjVB_X2J^hj8)+Z*=J7IbJ-#XJ*daRiJxSfQz-Mi?@(p*iobZLjigNU za(mIWOst@!q_N0H;*elxbCZcaG6Y)w_;&HYRu^|D1u-9r$8!{;NL3u_ZiK>2O;yA;|t47o6%mP!@i4w`2FJV=*+emJK zr(udE>~U#~LWJWn(NCK|cB+W)V<3~dt#W7QX^C|B(NCUoj(E{bV35_#q8!!-A^(^@ zF<6CNw35HU!nGnEAhWoCwOQGSo8fkkn>Id9^5-_3$h^&KZPiPlre=btNKNlhWDg`3;R7mq(*qpb$Lmu*xF7X1)U_r1qq1&I=~ z8H=IRtel#PBJheHJYDz^BQj~ai$Bha_U-8_08i;oH2_XPM!0pRR1NM2ag^J!sPssR zLw=SJVK**oT#rLk^}lkIx{9-#n{UEZuPSnA?mmdZBBqH4t@6zf6SCxDj=lq7ADYD> zC1O1~&4j^-cU-mo1uOCW2_=1}1aUYWaW&Rk5Q+0KngquvkQLWd>k;2}sGOM+}=!7ZI6HjD!(D6pQ!`#;Tap8AT!N z9T3ZOLbcUH;HxGh711F?a>oobbC89S&Ih-f(Tp*!{xi%4az?_|mXvL$c&ybCDe}=~ zIkGo8;WRUp5eLX3`H=iHqVRqR#}J(Dwgqede&RHAre;j-ARb)n{5I(kjBYhQ4n-vz zh9?oD*b*iMt$UgOlUq7S2S}o}mL^z=1a`05dLqMlY<8zv=%P2of5FEE=i2qG)}kiuj)0LRkZ60H)bx=5)BNVr&T!chcXX#Wj{288L{^G+^^1PG zs88{)G7aHLm!1MQHXiID?}gS`!am45m!s_vh4LL^14`~CX zH|hZl=zZ^v=+4hqYJ+h73s%53yzzH4i7O+0s%c0pU0(xXop%+*gguheaYc%HAFoKexfoegv3DLH z_EikT=~Bd_=3Wb8P;R@7Fii*q3R*7%yR;bLiILU+mRg{bLR_$2XLo8s4UgplJ;;>a zlX%PfF5>l8fTZge5j$irx|J?J=byKO;5-*^Fa!` zRyMQEr+ySP)0)jGZYN-*T5aW@XM-deq?g7+H$rSttQ&DCcvX{?y1fVPUl>LO$~J{$ zK1%i_VlOrIbBOj`t; z=LV2-?3EP3n+_~TpR9CXZRK**lDsoxP8nnjReMv^bS&HbX#~g{J_(c7KDwfM8NX8W z0ahP1T5G^%gO*5oiNVJWp|0Ou4lb0IN98V5=k`TRr#D()d-KOp2|PWm{i^w1DRC6_ ztx?aVG7>}68Ym|4&(MZf2-&!q@M9na4S>KS8XUPI_P&=L}Ot8)P?l)4xzT} z?^`feYd75f`MJ>wO~Pb6%SRwT3lC%UIn<<;`NZrXGr!sxJ{U!w=2^`wD7QISF# zcvEP#P)+10ZsP)ue$q!KMAOG$1h7tltjKI})AS{Ekx<8+Er47bhRtwqv68AO1oRR; zq@hRm1awoHjt;u@&uxD?wL1{;0NbStMAkuu3l=?b+h~i~4Rev6?kBYrZ~?yR3qHzC zC}qy96gkE+&G$6w7>p!Qa}E~g1i-d2+MV~JrHpv`kmOJZ@hdIIS6s9$1`y7!7Q6#AMrt0WT@M~%3fb9$RM|V`!3Ak7kgc4@Z#An*{-+IbCYUfG$4~&3YS4M|wIK%EJ7$OYJ_{PP9{0u@LVy4r2L*2hntqp@ ztZBQ?prP?E8smx~wAY`Q$i9Eaq3JAlg2kr7CD60yhvTJua&_l3ythGX*HYV%v%6s} z%69x$l=n;JBg@Kpmt*Vr__s;9bXgG3p&uC z#ULb5I|I(rtni_-ELY?T1j@X^DRYPIY z!24*>yD}kJbId+!pJ4ui5LOfJ8}a2IVu*2XaHmekNXw?{k$<=Q?cFt&Yk z-RNrg`dhd4Zq2X3a#Y(%oJ9eO+}Y)HiKKBlr%3l8Q1g{>wlbW*KrAfoPdq2Bh^S|j z1t%y2dX)lfG0kBgOm|O+L+Uy=qqao6W?nBklarDz)_z%)_=f6;P8VqQ%ZX0BB?21r zHkKfeK(&qAe(U`mH>kzDv6=F&Sm&r^P=P2#os^YqhwxXbJ?Yx zw}A07u=nIOaEpcAJO5cAPO+JVkWF7-|6l-v*?-Lp#OF%_LzU|w90$t+15q}xez7wj zNFMAJc_e$Efll>>kfuQ%N9A+jo-DRBAJF3FO^=D%<>m-xEyAhzo=X6a%B$S~v=YaB zq>^eOS$e&mc6VPe1$LV1!1}m>EcFV4X{KQ;HW02<|9sS9&1KMh7Sqiy0e2Kg+|UgK zx6o-uBCse|X`^izns+Ji!SVCd{Sl5PZv!=c5E-<2VzLd7a7N=Xk2?8{XT}GwwM+O{d)Rqd`*g1C3n9N29 zzx#NRY^j?Fx4OFI3wzX=R@_Vl)C#X_i=K%YYFS=r*MvIGD;>Zw>m{GIxv>Xo8NE%^ zXs5ijtovump%0gY0AsKeN5k^ef(qOsv`#OsO(8v%qgkW$hLE#y#lo{Uf+o|v6^lCM zsOx8IB@v497~8}O5_5bg)~U)eZ=p2O)R0SLM@ZY{lzL1=iN4Pxb2%v|ru+;!cO_vKBX4g{rgilj&PFRIX#R 
z?trO~V0yVgz0f2ymeWNTX^MR0t<;}ti#fH)(|F2(J0!GZtojKt*Sa_Gw^_F8kSgo9 zt8c}7(%1x4le`@jyQN&oP4 z>(yT3aO#EYGYb$p#2YM17qMaCFduOj`Ya5xZ{5P-yQHm9zei&pOK${O_iK}~0aF1| z+SwN`-JQnp`|B(wz;7=jtq2bdYB($Id@AamZ6`D-pGA_WF6-Uqwf*x(SXMp>sL=nx zrpt}0*^qTcd-%TJBQ3G9-LyFF%Z?B|;Q|r6Z6YaW@8N_=+#9-{#bJ4QddtwLCR#by z{114)5eA;_m1Ud?Kw+VS881s5b4W3OX&Ps&Q;P^$nql{Om)D!~TQ`dy+T+W7Vs_B@8uIPW6|0>psIxM zsV#^s%^>6XRvUKs&+EP2PqyGd`4F>rg}`!jk!JTh{}@EPRjJn@gTgUTh8AA~CFUd` zMi!^I=@o8z{iIUeq7=2$R&kXmn30;Osdr(2kBQMriogOM!7vr zL)c14si7YmutvvUm4=qw>a?@>%OWLw5R~QX(LR%WTD&Q(T-X`WF7aTE zdiRnjkUt(hNGup^!Tj38L$Qb#hEkO+u5_+DdcKPEi0~$gk9JQ&m2ToUd|<8hER*{+ z<*TjQhR;8VnSz@qmpGH}`$>I>*bIxE%py^j4=JND>6sA20s>6FzMV5|^OPvjUQ*4k-77_-Kh)B;U^u`>hW9W0 zes?Q{*mvS`^$de%&Hmv2x=)6ZMr}O|B^U?HT$#4L2})VQzpD#o1}41Uo~NWAnT>Dq=AR-gP-<6x%P58I_hKhzK~p8;?~L4%TJCrsH5R2L zB$>k*C6&GBjG}V!X@ouv`)JT9Z8n(KZW&V|GPz?W1{)vJ2>?OgQ5}8>JCKzulqD91 zxYpE7mKaSWi5JOi1?{hP(36h*D0TEL%37=l@m0#`h|+!SUrYDdv`4|1n8A zS;2i;{$Ul*I#0Yy7u4wIq#Hf z%q@v9vH1F=91{e720siZ$`NluB6dULgJeIHO;fx&g3kp`+=z-mzwB-w)#;M@PS~o) zg72<8mD+fuvuwpepLdlQGZgSLMm~glad|qubrC50>=;G--RfvOx`d%bwGofjlARi; zowG*BPSIjnYU+^7H@Vz3P~e+Ozo`{+etV;OC(i?G=-|`$G6+(@^P|*yoi@d_x3Mvj z{cHS4BE822>k(`%V92F&`TUfTDr8GRUW)#NYJmRXWUa_WU{`M`K(q4xyIJ|Ob!zsp zq3XJ@_pP{{+r{=;1^0yXthJ(~9m~?m=}~LBze0G?-_H%}zW^v3$R;k&pNSH)eD$xp z*%)3Np3jyaawKm^T4D00yAJr`i**hk8%`!xZLF=^P+;DF*Ku@T7C;Qof&s6mk#jxk z4{}goiKt_wgi6Oc-|sZu2Cmllf-k3R&Au@sdQE}Op3Suk-$kl}=;VrC4U78da$Mw9 zW^Fj1w8v29lkq%XPMD^Oc8D>D@>udf#6iQ<#!WVsF(y7e;VNdu{+@z*|DO>ST7X;2rX zPza;XLtdc*RY7Pna{Ql{B~n15PHi0?mk}6s>DDC%M+=T`z(1UCZ#?OJ{d!S8I$Ym- z1-p{gdpny-RniD4^Kt~12nl{AlQ*J%K)7{W+`YSGU;^*#PZ~{@6On#(Swe`w2Vd}g zeH&{vdCPNYJ8xoOLQBiAS@yX2aAUWL{YVrwwNVnx!QE60MM`TbdynwTYg}+OARqp^ zj`Q%_@v=Gi!d5I>)N80@t22r&4s*)iYs%GOq?Pp0m^_WrnSpLC^(IW|Koqk*W@YfG z;<*+>ZklYP1es*xQ%+lk#%>EPtie*hUQ67q>p{}m#Df87kWD5(sw|0!bq!>r3ACRf}@Kh zq8dzu{IBSB6fOp{mfc2JJx{FbSdDQOcBo1?m?TX>kdQ&i7VojHJV1fZHCi-p03!`) zKMy3|>DY^pPR8j<-_D6b7_Q(NyT7bEO=RWzaxOKI4?pmu zsvSeON;E$3nRCBu@{}Zxv(o4((ZT>jRi}VR8=ywk;KSSM(Cd=@#KD-;`n?tqEdyn( z(@LO}qgjc>CoRxpVBZFtfuIh?2+$9fH;b1RGibXrUh?Vws=P#pHs;>`I`g-JWKRFt zaM0C}J|ct(x)}7Alp+3456K|~mu?NDG2VS3Q++mFOKQ>D>XW}Zb>(P zgBa^*l_$f)q2Ok|LN+TBo%JS#A9eT3?)fg6CfjYT@7+bS_a%J@xE1U`vD+B$Nco*L zIFU<>MGj2~UA;q{agQ}m|JGNHEf*W2Ty}=T*y!eq9nw!*%rovBJmZu_1K0MJ z7~r5X+;3dT)jOXiMG+7w`L}Cm@|hhzpRHfNi+vHYCkqiX&oX`v?t^WoRv8qzze(q_#l5w?hL&l^4tzPZR zm9SH(>Au0VWZ?~D13{LUXgY1S1`4hBf3T}fC$bymbZ#=~{vhcVcK=3pL2$a(m7i1V zr#uif)cSjd(|W-Q*uf-*HXFz^d`O~^$1mObU^lSP54@Xxnq|Ayw9_ta<;24Rk=PR* z$VdgWH=@O_04>a7$UhNms4j5&n8Kz8{|z+9?SS3Dzo2S>M* z%zUm?!q4dW)?$NIS@mz&S*1o0=hS@h2({qT2%*U6nJ_1NL61O&k&cUPG&g~vLu^^X zPy%Z9n{!j8%{dCIX?nRU>p~!N0_VybA#AVcfW@FS8Z!7>QYj>p;^BJyl602H^EXF# zNWBdRB3Q1iy=)Upk}QdHy9&HDzbIu+3Yno{9M|%(N{0=jYJurSeJIyhzgx*!9Nfcj z655eUWtDlqUh9!E{hA_v39^#^pVr36$u>9mYk3~23B#?KUzp1%$tdqZwpv-~ zejnh4z8Sv^yZShvZ_$xmveE$dgdTn?QBXY!dy0)FUD~Tc2LBd#JS4o>Egz9Q#ynkH z$V9vp%1D##W%6HnBU;TVS%`<$T6}vrVaC#os|L3`T+86RVEb9Aox*aeARd&w<(>EP zOC94mYeX6{T#6&T3K5KWygJwb8Jo5*)T#ieBem|M+R}S&#)9bf&Ug1N>W3?DZb)42 z<~QGxVfGDY9e??%sgUfj6x>RtyINx+qj1!}!A~ZNS9H8SC2;*Frr3hjO`$Bzb_cLr zhC3c^PIK+E9?Lk1gj|J{5>-Eqr5w}{0vQ>Q#!P$ky0b9lfQ56vu1KbNJ_xXxx6l3j{ocC+ z`t=Xk^stGm2>r!nTYu-y_wWAegL&p|>h-;(YFP5ImqL1by8llSBb>jM_NrzZPpFuO za6{*#~YV;E@9!s|G5O4~_aDZ4HN^pQb- zfr!37NjXh8Omsf-4n^ooPB$LM5RaJqnLll={DqFdP#h}l!iT9L>Sbd}kz2wl6ma3z z)OG*ChhINx}y5WcCGNwmVXxiYgNL z0ysy9C2$Gqc~`>-ZpDCfmwl!0uhf(%M_7v61>}tVC(kG569Z zrHw;;A17e0Ii`nrsx8 zLnw`~OydFhY@hhYlqhpry5gXw79D<~6u7i@B%=_{2690JE=^Ta(FbQFEZL`F4e7_4 z>e%HG%}cFn$xs~HUu`ZT!1;W9s2?Fw>>e-uT`xT3o}$e*=Msg0mey-?^w{1jO+LsZ7h^Ab7w& 
znE7-D5^S=5yo{WA)%f-jdTi1#k#a`Sk}4%L&FR^rDANUdkIh8{Xb2%T?N#F&Ji_vi zIX}NLjHNr`@`MfxJ675=ZGNIhiQm|Hx^L1nD=r+3?0L8)xHNVBDfW^%5Z|BA4}Z?r zK0$tVwed`?j{#AABRZWW=%uzLb5K^Kc(})R9+WSreX=#5d8U$#jI~yiy^C@aFq*G02b>A{xLY_xa7M@V+D)l)D`y+`qcR~oL9h7m>)_2KYZhju}ea7G9SAe9~ zZ4cp1J9yD*joVf@_;awO=dAI9np$XJiNYy)j>@u5r}+bh+O^&+@`>0`DbiMlSw7Tg z3X$;IOtL1&t+IEj+BAnUe5)|q@{w)XuBh~I99qc5CT_)+i@AstjmVByJt1JRYH9{4 zPOiR_)MRC56+(2-rCh zdSCA@KCl?^CWJS?n6VowJ8pKp*)xo_Q))J3*KmU?mpN>W);&egh$!!$@ z&V+IZS|qzJxP~!>Tzo@H`yOVxqE&?iF8qG?aOyStNgz!{Q7CK{*R(#kT$?-S`J3jY?RL;N0+9sFgM;+xy-mIOt;~T6leaNq*Njk^ zltt%jC-=e?Rq{X+!*SON0H@7tDTRFbmQ%g$e8DbsmA*bm>6d#QE$tcNGZXvciwLlkQwrbm`7*7g=R-G*PqSMx zwP_LAf2vqwC(&j{bKKjNwrZWOc?I`o{-V{a*k@AKyFSt(ev=W^jX}1n9o9>3*5pA_ zcUbZiJKOLAJq%DdCkVxQ8P4nx!&RVL_xTYjl{&3jeP%`V8o$lWVRv-A{A~A_nP<9b z+tqtS7-r;~Cbu&T^a*$kahghGR1=zAt*y3NSKjf|H=A=hOdAp)7r4ksFQ;-nx-A<` zh^CU`Nq)F|diarvWgIiio<-wfy?s~2ak|yq>~|`7>0rO^T$i+C;Gq<@7&7r5X{m|G zz)~bXtfQ44mKcqtJet!*rkH+h(Lu$oS)JZJ8NC5D*(>1EbLISGiEQXd;V+h&VMHTh zlwiF(beWW1LGSyxK!&pzD^;kb_GxfAF}n0Lf=Z1$IuH>}3rG5T2iA64>pCXQz}>Vh zx>~=dWxLdFl$0Wa>eq`yY1j;)md{nMjkVxjb3IAoG}!2+fQrpJXuN%4oKQ1n3^4ip za4jnj?5jfY7rPx`w6aN#e=v9TJ>Mh=g-73mC)c`?6mkxLlLk^ry0fFZ78Y(>iyT`1 z7Zv~-1p*;%5>0r6NSeBk)`?_=IApJNOG+kFp!2v1P}!Z8!~q}rO+>Wsn7x8j_QKXj z5UiM2lNMMx8~ZtOjt8e|))%>Z=#??#e3*H4L}C^JnYT=XT4zxkLrgU-(epeKMaZF1 zt9!v}L9sxija_)_@1Ulp8`*_JFy|~6Ce~dFGzKsD1OfIqMn}>hEe|`YX>sx$L&}m? zOf9d?EFTF_aP1g5`>q*;w0St1Hd!n)vW|Bh{1)ufFC}aEJT~P0%GpJkU0{Z9kk$zw2*+bsm-e??;M|BBps)^nLsIlEvU+83G>C{ni3RR%zAUVM zDdB@K*i62-iR(1xzrm!OBWk=2!Rm|20f1sKw|#6qfT<*Cq0RTH4pheGqVWNd8fRHl z)FBBAcNx5>Jjd>g1MKr&^ZP#SvjedF{ z^{bc5MA#QR?B&FPdgKAgDpK8={uANSHA9viHHoM%H%=l|JT@&?y21ht&oq}3SZZ0g zP>{wB(Od)0oE@2ui^br+%Yjdlt}opeKjoaGq3}08Gd?W7QcG_IoXrcx?(pd!Y7cgY zzA~l!YWn?&fM>D5(S?DBQ2elWyw&*@$~7$JeJfqK7~8-IuJ=CC*GYrt#S$`1_+%ka z6wX)IN{0bXLPvU@q?#7z3`&n5b_ZDGa68dY;<8zLUOt+PsLWLVWk-AY1hkpeFwS|W zB3}$5ZQ8SB-1c7R^~?G8n-uf#OoAKpci7KNruY%5+}Je4j4OS89LR8sNc$D`sSfsA z`J{J`pn4~&Zwmjc5ms0>_+|*pR^1#y`DSnR0Bb+y%Pcr!hzo=X|LXwpzb?pGE{VGyJO^HU?G(f#MuV+kg+w z!wOm}z|7v5(!M4_p^o%7wIcfnuQ^P4_;LZjv84&Dj`jlsTV{)tj{t(@X0x)7?gCXuIRqm+Qoo(ZQ_5WsCkPHQkswz~!q_nIKA| zwdVF>utiCGD?0ph*g$8q_nj*Ock)-0)5Y+cDDq#`9$D2Mr=Ay}{8E_L$HA=E#Rsu( zfEhV{C>+WS`zpy|!RMk8%lIvUMV7m@qKSo&)el-@W5eFyqJ@}-tHKzGlRIfWqKQm< zw6ug~XMn0QpWe7~ZTn=ui^A7l;c9CKWb?4Cy8|BD?4l0kMVRP}Zdag(WL%3N_Kl%oJ5t*322; zK|tk>+Mv{@4x2!Q7VG@S+C|h5v$g+j)bnH3W2MoJURpx6`RyW6bT!(aB3pFYKrs)~ z0vMjN;g;NbaYIXCSqcP=^$`(mfDlpLtJ^_=u3Uzl7QnpQktvM?aRI# z8yff}=78j=fd__8A5KEq%mm}!(tRP~H6VdJv;XDTO0B{Gk2P<;d{fjyN_#;Q8RaiJ z3{>E%y7n`kPLkMtk;LMP{&P&1Dir;hPawz%jxF6T=_{XR|q~X?>r5KxDf%EO^&hB3=}Wby})Y z$X&NIB1$<+!|+02QaecI+6&g;FRJMyBb6Y9dV(hOdecGky-Z z@}C(GNDN9$UkS)8zJ}Kv2pHk?_WhF?zLo-TM_o7JB|wWAU}X1zdF~%;rT_&g;Acz| zpioZ$un=b`%N!sxt5xxjwE=)a2=J3$SqHGp46uBMI)cLuiz3ScuGIaayr~TEKY~ea=dWq9HiFQGiy}C_tvXT6-Q~4ax(HLI-}h_00aU3`qHs0LV^@ zB@qw=u59>^@?h} zh1{-X4|#vDAC}K@%D*Ob=6BvDW753ERUItRuEmj!NrSqbMhONs(SlxX)({h47@2vl zWqLyhINioAQS+Z_9o<5afyhn_i_s&@Yn}-3SY9(IpCE`@TNoD%0M^Lhjfc(F!K~ix zQ9a8-<3qzW!N41IEUfhbXq#>5c7LUwRxH1x(`bg@t2D4#?kUHacuozYo7?)%N29+7 zI#@vDL#_@I{^hJ&5;B?^US=5wZQ)sQ@9LE7xbLU|b|?G7aLCaJ(WPd~lJ*l^c6x}R zR{g98esnS)6Jxj zV6nftKhWHD0QHQsSZ`G`o+S665F)pn0a0IqH8X3dS-xTiT4Tu3=p z__3GPGRHP{^vvUd5UAYh9nb2b)@>+hkI~ChFYQ6^oXYwrYjw0sNZ#gpEK_Z`mG}Xe zQLBIj3TK`}8qMJkzI@`lfFNY7p0P1Ze|FZ z)*QX}$VQ3~kz&;=tT-lH)s)KRU^|Ls4n*qX03gX=$CT8ve88m>4|G3|676GfIK}X} zNXp<#!c{#l_@FY2KfMc7RUp?@E%&2tnbI#6ssF}TN?@6X0e(%!yF^I>W`D*bjw zgY)|StsPWmD<+I|9u2^g=e&Nk1Uj@61&khg(qYQQFW1udf8pr=LT$Gzsk5GNp^Vkp z*(z~@ik_^sad6Yg2$k=J`sQ!Qg&y*;iHnWT>F03*h&NgUI*1E=EqZZZ>Z2-NR 
z!1kd5c-TRq>EG*PWcTZ%Z`R5T%Gs%AMz$2v50I@ySG9XU;#z4lR9u_DDo^5wA=PlU zJ4TB6lhvPAE;_gw`Yg$uOYjTT5|PFzK)2Dl)bb7{cJZ<%2b5jC_!*UU3Re1SivQ0g z4RE@zEB!A~f$ijWbBP;Z@-Cr4>P(?{sxuogxjFe=mQn>7Dr1Vt4+y>8R{Vf9Ro!*c zoxp&l-f(!+-zB#R2|wS+r?CILJ?40D#;7mo{J}~;6U2}|K-oATD?f@EGtKXFpI4VA zWTWYw#+e_X&iR)rr-n0p!S`(YO%oY^uDa##Rvvog?8i6ECgX+6furnBJ^>0wW>e1# z;-$X@TDOL_ius%G{6V_eOg>E8Y^i#cr0%K8ONo|`e}}lp78;EHq>E}W;lq+XKA6Iv zqMi>bXNwIMJrP2;JUzZk)|%*=2J!fw9}u`c-ZLAv`(jk{8{U53#9E>xEvdbY02)MY z%pZrI-S^u_N8(`I$?wXM$=3aG4daw-bEHXki}DnCUglhB_r&AOlaUH<_rPISB0FY} ziqqD?fxhtj>O*xSdejx}oVL3ySRM+pDv3<-zZ6%cR2vPx?s~=dPpQlQN2$RpxsF={ z+Rb06rPfV9AGhL#5pYFdPGPEW?sZWy%RaagI9?xhq2k8qa-w=T?fra+5uAv>v|d`1 za)KyPh>|cDSF1}2CbLsQg*`65Gz74%95EAiD$BVm56qN~%#Qo$HHc5Qx}2YP%<+3$ zBdv`<@iejoTtJ0-z=DlQ0!9GFH)iMeXF5sLg}~9>j*NWY%ljotx%ZRZLSD@t>yJB^ zmiJO>yRBX^syy=fYp~9UX8b^)6@GyCdEfgW}5+{TbDb^nBCg*VUyM`z{ z&C}{Cwf21mb;tG4_=uGsRuRU-*#VE>6BzgK=1Age&t`2VdcEzRsmw#UJy-*K`ejb5 zBW_`(UQ_N)cYrluJ7vvt>{EasNE&_JH&ZanxeAh9pb>^S>zdygTf0gBl*oG|>&Ll4 zqN~H*!BVwQ0$65?GyMhOTzxX>yV_PGIoAh7w1H7AD(Ag%9blh$Lf5`vz@%L7#)u<= zt9Jxk=WOxSZdEd#z4Ycj<~8bWg0Phu?&t!yp;jf+G)M{i3|gP4{T;wCvl z!MZW>-%9*0ERd^sB3J&J2^FHb9<|gpN)qn&SPW0*b|VVfP$ zTiWV=)Z!2^w`a1>D)kRCdOmTYB1(3@Q9C&=-sLp029A>Q+2@Ks-8Z{F@J^X2g_2vi zS{fOz8*IGBT7j${w;~>FUe7MS?;~m4JIV@DNaJ!`V#3&g#_EdiM$WVU`r&9~tJjI~ z0gt|KRlqs+EuIDIeAbif`_aXQ-)FnCAGuHFu}1boO~06IE(RfeH~E{pGe>2^OOrdVYET6_-^zumr8@Bp?x*86_CUd&ED!t`fW0QuT-Q%$VKvpc@!Bi z|B=au)I=6DU{XF1t<6=G@&dCLLKe+!Ei3GZzk*ixaYZs{6a~n!Y7=j&XPC&d9xo$| zUCf1R-}AKLH)d8vn3vgdE}r=A6g`XznS1!dFnU8#`s9eZ*WiT9|h~H@@QiP49)>GCnM` zzvQPb0TUp!_a!Ku)G+-{S5|A6!y5`s+}*iIX|Hut8;c~0bcT@@W~sQu z(YWS;^J)f5|KF}r;6K-h4RDQ$K8~WNfw|f#bnVNgfMZP6U5*&wr$P>Eng84AmJ3o4 z$|HV^LrOzJvmVR@{B3E@4O+mX!=9Zh0Mf>+Q=0$}#%hYG7r6WX;Uyvb^O6i8z=bLQ zyrj392~YcEX>nA5rxFqg_zoNKNW_5RiKEre0nC0}eANcbCzi&$>A-83eOv~J{lB}m z@Bg{B(=x!TeGLbmU}W;2|0oawkYxr0`hUp&A0hnjstMd^5Z~Lg*KjG2yW7dSlN&;N zd;hk!(3cOe8%j~_rX;N2%>lMsySpbp*s7m1mJJ-;e%OAt89WJ*5KVmQ<5ecg`^KV@ zr#(KVqbH&8(8YrWxwldHBdRKPC?>#B9RX?^iO}7Cz?xZXh20QlJsBopevdpF&(kG* z*45;q?i_BBag4#;ypiat5xL1-6S!DqiM5*5J~0@1qPnr_?qIs9y;@B?-7&<}hO@D4 znU|bTRSsD@X>x^QSu(=Qxabm=OMYURNLh%#dA&Enhod|CD(x9`IVu|VYwqP~d?pKn zyxjWYNv_LA-AK**b z7+jj}mcP;XfBefsl~*aJ2XjRHC>G1r2ZW4_y#HI}SK->dh|Z*fO%-iyY16~0rpxMY zA&vHGL?&a@pUCfey_(bly{iuhRr!+rFz4dP&oknVW5y8DbWKM(Sa!}wQrIVdd=RrW zs{L<$KUu7gySh}EYHn!R$TDiQuJEp#;nPWeV6R9iyANDLK0Wqkzo~6ABG)t|FQXmd zw2j&F>&Dhp_ew}2GAz%mW(k>(^Ed$tSR@mkpK839dHcdB$m@|Xr7KYhUQ^~^D~;C1 zyeM0;bba5qEgGOnK}w7M&8=ixC{cuGBG6V!LZ#+ER`B7;ar;a{)S>e%f*flyzn_Zg zuX-`VAb{nIll9wchrzTjEL^*G^ERh7W`eXGNHoSgM9h+D+P|uwS6H>V$Vu14!vDt|8(?q+6ghY zJNWweDH00gDg*u|r$=-)FG$SuZ{kfeBH=&;0bZ@?;C8sKWq$^rgNI+oPiJc~D2T6k zV75GI>*gmURL^4~)@s1m+i#9FDT_ zR67we%cA#1e}EQ>?_Go>rOZ2&a5D3$ym_6WRT_98_ADKKY{^3KYU#>D5sF1FUDp?W zKBi?9iow133s=tNdP?Te`JS6XY|FX-@tY@>6OUb0ux zJM>8^otk%_p3X^=6OjKfezUAZ`-s* zuB@Ho4c>_TUo?G%UsT=qwLwTr!_eIVLwA>SO4kex(p}OybV>>0-(?Q{(B&*!oiDguFCf0 z>#i`d+p(y!6&_^h^n&kXZpcyBQ$=^)4MJb)q2@-y%r0wYeIQ;fc+73`H&>8y@WcL* zk*H0WX@C#neY!?j0P;PSuSD=e!jnJhqifO4*_p`|xy3zXDe>ufWm|cKN+RG~_7>5P zgc{+tDl-(J3L921FH_nC5G0n&dX#IEJAMk)HjawR3B?0!%T^rbf0u zFiH^{!F=1pT@5(l$umMQ^}Jr0jGj$=JSN32Bcd{V{4b&CK52JH0C|k--%EzM8iJf4 zPT>RiF=Yn&w@D+acAxXmChh|9rxU^`aS=U_wT1I_g?W5^g`j^Z)n*-f>wJgPZ*Svv zfO&%ADlgnHvzuvK@ukIS=FQ+P~TaM@Qm9G^EZ!*Q7BuOM>GE*+g zXdeBp7o+k&u>Pdda@8*;Ubk%*b#;23o{<~&NubT*y&FeIIcf(5*y=g*%>^x?e%({o?5mOv5o z?U9H}yJ=>#6Pt-AjO(*Pb>YCXS5IR8M4SdgM8RhU6*|n9<4w_h?u|xFP0k!!>b?n) zVJg2babXINq4>KLbAlP?OM0+%m*)?9Fvy_vF3P_%nI36uJL69v8xIW#3!8ckMG>!c zjsmZm0t{pKHnKg`6WjkNKWPs0mTW#RquM0SaJBWD)7ILwaz6o3`B>gXv^E(~*O{({ 
z0#`0?0EN&H=PnMF?kp5jQx9wQhqe38-mT%j)AitIJo45;wY)2MW|tt<`_XUWz?>=0 z@p|_}-T`m}p~-9ei&QL#QYz}BZ`Rl8v2)}7DW9>((FTWSi}lWN{jQHMLHviIh7HTi zWt6`oMqR1IT44-vWgC~-#qbm%GOe;<_@nrr{dA1eQUJ=JYwgH`4RmSnNv_d$@duzu2p1UV0~7KI6kWzYW6w?T{$l$dS1*IO6{VQuQoQ` zsW~}bC|n8oKG6TxvsM1H`;FCIvk31;SLNG3pKzS^2+pMy=YZ!yc+aRY0hW#a*DvKr#i5_?@xB-|1V8>W^~b zoUfW?ZgHgrrGF)2p=LYwKm7XnGDyS=lcDRGGNSX5$S#FJhMfOcHgCZ(7pp*+@Hw}} zcBN5|BaK}V_Lw`c8inI}@f29O-k|I>)2T)gzdtAP{Gcgt>~#OeGlg&(g&taF`J z(|S_97ATEili=yAUJk3>Lp><@e?rDL@qx*-|)-X*z6g_L?bu=@ql%>9`wS&x=OE8 zKcM!;-d~wlwyy&DJ6C(3%N$y|s46=D&aLs{x^Z@rT11fc9gXT8ua{Io2VyaZ(Jxob zmq^!LkHO405gm)Q+y95a+L@UTam`Y*2ZVgho5WXnXTUq66QlIZ=!9aAD;As1gP#zoTZz))N(bRTCB!DGi#T>)mQo;aUga!ghOdQ>D*X`h{iKb7Nxa*)6U5`hVi4E} z){j%$)VT`Rkyct<73Oq}uAE{jh3YDjl{b}7)>{)rKAe~zj8rwd{zYkp9Y+p2fe?Jp z!?b_i2YkP&N4=){GZOsaC%^@IsNeEBu>V?s@dsKfFYXFtGm0WSsgZ)Cm=*!rjTNni9?mR2YH9Hi|)$);KPwDAn|_)i^(ck+QC)DSWg639^~wKT)swID(5a1Dmg#h>VbG%9mfx@f|i|T2P82MR~v{qMjLftY%%Xr zXoOY-`{h_OwFOk=231J=R(}BnvGs^?ESEUyNnb)dJbo1Z-W$24gyM8VhMm`G9#=WT zaVPK);5HlFh@uMm(*U3d6<}8NeS~X6Rm+z{3uZQhmDzOVVcnL>Cr9Bi$}R^xVxs;} zxs!4#mFDUfukA%GWtlGb*3y&3U~Vvl2Q>6FZr%1f_4 zy&e*ks!lZBo33EvXVPL%3x1=!sEGNak zooLq4UmG*(ay@9`Nj0?TPNT_^cp@R+)^ntzprJ_3!)R$KOXWO9&M%|HfP1sQfj->e z@;#vB++r`cFglYue>Kul|8veuupEM_>j@X&XO!Nh$;0LR*!~GTsgC&J)M_CnvJ8dx zW)VU(YITPHfFzouOjZ5p2#eo&WwKykf`7{A zwp^20yS3f~mUDd!OH)d<1SCdYg77s|OdC}~uvjiE>$EjSME_nMEYva__|NJGsrzr| z7YvFGCG<>J8BN847A&%fqqc!by!#ug3CB4*!GvA?q$xez`9=Dq+r1~cv~e9Gj{ADq zfqg<;F)xp}t|W)uV9j+ox$#Xt*Uf7Dk;(Bj9x^~>^Qtx9pgeS3-cjA|r5)Ntw~cOdh+a)_Od&&67>_g;KIOlpC-QCCb?{QiMMle**x zik^9Fj;R|U!w~emBe>%|4jX~77DEaWokduZ$R_hewVyh4 z=Co++MXgUsR7kl7EMQ$6R~(V!WRbEl_z}twhqmEgh1=~PsA_mDS8w;Ye>**rE=i6EgUZu%TsbdR-62?YiVToKeEx48eX1EMo(SbWalX| z#1hJ2CUgd1ESFoi3vXdU z?|DJh<9Y~icumyPE$SJ`ZL1h z+x^v=oA6g!6(b?+^t%37as~i^H0vB&k~}wj6~lutu*)QZj&rA*>6f!ZcqzqIH<+$U=Va9?0Iv5XSOO8r^8QG`stP2V(KVk zhH5d3m!m+vlQvZjKL@Q(OahCx0MBn#N$YNTUKytIm+sby13)zG1N0mhLoPLiGBP;} z@cBRXDC<$DFe=A{qoo#V_Vs$WaQ-S8B3E_s6LD%cXCfmg<~Oc#5S;3CzHvKfKm{@A1`S#zNC*HX5;( z7NcQaSj#mV?Na|F)t7jYBjO0JP_y0gEr%XEM0Vl}!q&Z?6MS=azsVN_ywgnapFhzY zQu+*L!DCPlM~NUSFWfVc_d??qmXqA6^&KUl;UYy=CWa;uklvA>wi*6QMjwvb<#e$< zpy9o3LEFCwa*5j~+5=TC3!mGI!qWhdQs<-*24d(@MUo{s18>#No#Uga?QHq$oth1H znIQrE!KYIh&5wwMnQv;jFiv|>i%s&w#WS+TkJ%^J9kSY;1CcQQvozR_&a9vioiDmJ z!e-L6H|J-we-mR!V%ed=vybIt5H zh{Gjvmw1lwTC4k6z#B~2SmW(>AZpLPe&J4fUgNrMDr&*NK^XO^OfiK(4#d~5ebtVA z@GB(!)s@OJ<>Wf^!7Io93H@3mv7tomPacRg7R9%TGHLqU1j|&uV`ijayfXz+(A(c}j8YB))do zEB>aO)Qcjaf_zb}YHaq}V66=9Vb(qYaJ ziXNciI~gePYsXmrs|So^H zR4_LPdM->)@m@T`H|%#qUn1TSQ~|*sKA`=fQUg>?Awg-Sj0WJm&c+zxXf(zAWg&mt z>+IVLKma72{X~oBcb(4+W3zL_KoNHn#${WqbP#9~vSlsMt4&JOYQn3=3<}m*SI)j( zm?|!phVhiB#;F!@Yvah;Zk%`>4Pp^&!ZBl5l{1AX8uONRaN2qyMqj1}>WHU0wfx5x zge_|E9y65J*^y%qAhNlT>e>v(mQ!IN zX|c_s`W(#;T-uY9>8yj=nR~LRi5AxnLniFpOLCz39>lUUbUe!q63X|y~4-C(NX-z^}pai8L^V+nn|`T@C(SxSfgZW#XuaJHfVw9X`iAuL$wEPGV}H&i zDT#wdMDA)bFRIckU;}4q);geZ9rvRrtxnn{70+z|PrtUH}oqO(kc zy5DonY=uR#^^=xDR&)C1)wb+HWjtTsSG}Tt&ugG`D0?*zXcYg=C!!gXq|D7j@$$ zOvyA_z}d9o!^=WEZi9M7^mGK6H;E-_ZuRl?AnYg(po-<%tQEh*$&w5vQ(a0rVW_C{ z{O%^`xzAOh=akU-$S&p=qU(N~2e=AW2I2|h?ry!;-0q_)vM^Fn=Y&_;Sid0%X$*L7 zyDJ#|`7H@R$6aoVxwzuAGlg8cT=OOv@5mzP6JjSz3|A zg}Snr4@qLz)tbY2ch&6hMXnLQs?1kiyT!HD*E{S(0H{S$-Eoe8wd4D>Q zdAc#wdv5i+Vyl88RxgOVO<1*NKiG{q4cL@h(6(P|5wTUc1x28Pmp*j_9I>+2*7(Nm zGK)2YkrsXhZRu?{kK^-WD>pgztO)PcL{Furf1S3W*tA)l$OiaR@-*32mO((^pfVE= zCJoHF8kptK&OQ;+<1%A>YId)Cr{9-r@(PSxOUOw<0v#DpgeXpjU^+-(VqCxLSHL-I z&$!ors8PYA#}z%!KW5IT$ciPx#DZ1~&dp@ney9?=NaX8Y5bD&EVG2WM62Ikhw}n0V z*2nrd&XW*C<^9U~=2e@Cq?)%b;4JLWJU 
[GIT binary patch payload (base85-encoded literal data) omitted]
zN+K13ti4`FJSh!*KgF))z!HD*-u{x;tKWUw6Nz2|m}wNQ#}PhJp+KI# zCX(x{ra&GE+YXuN5~e{zFn54PfI`p`-IvKIb$e>kV>rfTiHt<8u#aE{#j!%@GMWLW zS0F(B1{U7 z*vY^N>y!BFS!k6EI0ciwh1XlQpi)QvD#Anz`H&=hm}=EG6akccS{8F#yJ5E|L4sH zh|%!q28_0CULs&*VFE~)rvfC@b^xhM6@8`9yD=w$cQh5==0Sm{$tI8n@^FT*PBgH5Izly7HRsE+cHeS0TggOs+r>Ya`3do8B6tTa;Ja|;ajDgr|=D){Y`M})8AuBbDGW}cOWW@41lDb|;5{9xA z@E4VBSQmgSYCw_ged?K4)${=gLcn8K36M~Q38?Juzl8)c=!qD4J@tQ88CdIN{xdq% zfYI?)_h<-^P96~aY7&aFMBpzBylJ-qS;|I$m}%vbSJi-1wVnXrkyKgX z7Empmr7sK_crdFQzN!pZ`2sT@7~8m9{}~1UQ5i^7_)jmH0=*RDSHo2NACr^;N@xAk zQv=?8s7(Z9X}p4PWo<|qpc=?L3KtJ}nAQ%wsthdf+tf7w8MqFwqu~Fa1K0lp*zI~n zC`KWpV)b-34(J-){y{Z=Lu0Ci$6(9C~Cd z*B!N`ld>87TA*1FUUo&d#Ey2L-dx4rexxn|XNFKr9zXw|==@vj#V>u_c9(E1g&L}R z^e(45j{uYFi{($Le{OD3^JUA#%;wL2lCFQ-$(aOvYguR0~9SHzqF7jqZ!S=e<#Pr*~02ZlWoi_2BI* zUyGC;Eg%mQ)H1664#-yN=q2*<8CI7z*VavGJ3WUf({A-oVmaTKDW(1aB(fR}E%^iG zk0zaQT(Vv&C}m{U?(F#-UP@6|tu(?=`P_*borm*!R0~3|<`cy_-vK+&{HYDg5S#Vi zU6DknOvcw@aRZnp&*2RDV@iFYP@p8dVSz?{^DiM7|I5!zAAkW3_KYT^4G+7&I?%s9 zoIgDaj`R#kAU=sg&8AuU9Da86-H-vNqT9>^ zHD?ihX-49Mem(*36L2zv6wpEHQnKB3-tdA-y?#F2C86N z%yWmj0kb31;JfvwC|A6=;%evtPmXxZTP9=6>6Q*d0#aUn{_Rw*y0I2b%%w)>cj7Ug ze@KLO*0UoS0rv~B_-=_7b+$pVbGZuoyO7Etkf#y+U_$~wXAP$kU z?yuPSl`En9aD8-odTP)f{~>K7`bsYmM&NMxpY0Q)+g~ktebgL@bcD0dJ@0(suJkfs zP_6jdpn;KbJ$sY+!X18`m2>>AafOR@Cs5$Jr5TUwM9wX6ID4sYkWv(T3i77^d%=7x6<{p@>tyV20 zr9Yz(2hYA=HD-!5Div*O=qC2p#PWQ-+uW$< zi?*1Bhbv4*)4F-lEPk^QQC`KT#H_kGJm_m6_{>C0%DJGWuUQxLBR8?xx*Sq+@XZJT zSkRfCzVIxJj-rZ$V6?Y*JSeos^LiL{NJR*Zh3X>w&_CWDrarx>5;n;DkpOdsOIv{b z4l^U&!{+8~8n;UpXn3yxB5bhVt@7b`$u$TTm5@5Sf7Oeqf5l^V4*hYq6kq~yKBoSJR7KEfm(OozH|%I8Y@v!89?xczGoedW&)2Mfdkeba6JB2H}j5y>#xab{`k9jK5WO`d~@74R{Xq5yf6511>)hwoa zxy!NN_jU+6m*UFUzNGt`;fFW-3cKUR*YoM3LL?M?_D`CP5-VKIIE8oVj(Zk>2L-_wMymZmvk89o`BJqLN~!_rq}jL@3o^k6^N|0!Bk6e0Goxthpj zNaJv<;bWE1T_8EFw>$f6r&wG>yhu*u6~?Drq7x!@M2#d5Zrl0JbsWNK7Gz#$P3jSU zsq!>y2K<1IbGk7)?8Y{pVe)S2`G)mx?vH@r1M)AEy+hH32Ymq)NANODNG`#=KOgi4 zD~E5R2%Zker3C||q+;3_W$8Xr@V8<0apw%*Uv>W-PDbO?;+8rbe`o0R7;XQw+u;8e zL0#&rPQ=i3u^d;_VZVc#Ko zQ1_iYS|W6?kpfVVJ3TFwy2%d`t%%M&ni`6=Zh20(S; z8YUnF{J(JV^(!t`?o4?_>*LcS0P6m~L1;1nLbZ@DUZJsuIx-`2!jzG$uT(OYQ7I4y z#8L*G=bI{y)Mo04M%eTrspx!1-m;;uT-2>#|g9i2WDhE(V}6F5;I9 z44}zn0J08{G-E0d$oq*{H3NiBlB&s?$c&b1iS>B`dEott+W&D71_VP`ufdQ*w4-p| z-e-HAdO|g;hF9GB9m44i2z`w(*8eUb0f2Y1ql5)$u*X`rU&?72r&>4288}^l>48EF5VvKxPPz7#or5uGd_Y@X@}H+u!zRc z(h2AwQ{C+(Aap2_Tm+i-KMu20Uc;ytnIo)Vw*ZXfpPwX|Y~py1Z`VC#D}8k)0zDdcHnCm)$`` z2>GpD3?CU48f9Md-Hs>zWn>XTc(6N76Ge%PEA#yRqPIb1mPZF|{@%=EwITg#x`F3Y zJkh}eTC;@z4~q|H6=RYF3T?sSn2MGK zE<~|K+L15&cm`0)E>+>?a!1tqk8WZlC$Mt*59XD}c+0?9Kz4D8WH0zFBf*`1J;j-& z5}&z8&5iT*qM_G(?+-5;*4&h+^ttnXBk!RX|DLR5n&>#!$|tYPv%AS1ITEoC??1lJ znvvntor%f>qEZd%OKj3dNQ&lTZq@VLzH)auXJc0YUmlh*okm*%CUskSgVA8r+x(eQ z*vF^TW{KVh({Rg%L^wt$am0DP?o(;X(tc=Zd8Lesw#Bl{3U)ev9O@_akz3VyI@W_e zBOibOLG`lI%=pDjZcd4?DD_tI^7f?+;(ko0)x;n9kq2UWChC6s{4n<)=Qg5u<~x%d zK2l8TrCaK(cF;4U?7G@wWx09mVB>%*c2Qw0H@uka_+B=D@1?HHVbR=#LbD?t$0}xT)<_ zAx!{Blztn8Hw&u;gZl6N$B*kZKwfT){@K#3P3l8=$CUZ1Xy7JY739zM-iZe^sSv|9JUue88f#6gc&n*D_WouDhr$<t!QL~nH@ z@-J0?Rsi+p_wYJ_U?EXnXB#yWDh3gf;ZT1Qnm>=oES^H)Pme1$>iE^Bn&^tf&s|(_ zHM2h!JzZ8odz$7pdE1wq57Hk_R~kh)o1SaJ&}O{r%9P4GeHm++ z=^rsWTaH2HpS%t+XAybNEJh*c-ItiwyVM(}xCJYfoC7lyc=dcJ*S*h{5789jU)<9! 
z>;s=wrarHFT%KgBFzA0vdYr)tUipcU8?+$9 zv3Lf8yzuIKdQtE@&-bV`CfKz)T&xEzcH$}yDM5HBjCss{TE8p?u4>m-T%}-qb& zi>=`QxR6VNW8rnZ{fhNdE38Me@XSrxY{5-*nqscF%Q~ zO|Ha0rz3%@ZU^@G{4aOsHALmZK9Nw{WD8W+urtv+#QUu`UQI$oi>0>u7{l6a47X5c zx&6LpKYvM9EB-Wm?R9<6o$Sk865j3fgW1!a@$7#$yU`Q-#U<&)qqEvoWu4bN<)l5K zKqHv1FNDm^+z%h`3Rt$CWMT{c?Tq~DR#BB4#R+_i?`bQV_>7GXHh9+ zGu}_+4EuEj#}rZD+;~Zk%@gd5678(E7ic!}AYk9gP4oEwUIvJ7>#VeT8XOVNM){28 zij~YA9xUH`M|E4mHDR<;EUleEm2YoW5$Bpr;kSp0Li2w9&XituS~7BEi0J+tbhp|6 zPAZub#_PErS=&7tUA6MlpO3I>LwnglM0{^Stox$$_X4rC`04G}WvawgzgPCz&t~B_ z?@6!`;wgfd!4t%`6J7gV&vcLLi_6$M5vl0Ga`{387~kdRCg#4WG}%9)SZUV#*C5Vp zq%*x_7!@b&9xoDjK1PdBQJ&qA4nk)mZLb~`EmUhi*o%#d#1Q_JtrOBrLM~@W=3%!z ze4i9+76NQokaUvcDtut59EDyD&YDcNB70Jtz0*1)=M0@Anz3cy4oexfn+qN7#K8ET z5}20~{3mQTOsg-Ul`}n3pW@e7Y@meIpCRzX7x7=LvHzOf?>;WWa{~7xv(vh-kIbR@ zM4>i_j+s4dR!cowgLS|jDkz2b301AO#9NNk&8D&D3yt4w))~;{&yTGtweIMm^3daD zylnlzdo1$$QDi^uKHLIZ3Bgsyal(4MBq7M4;fIlhE^qHT>sJppF=48&bd*o=2RZd zYzQHBY5a=I*=CECMXL@apH5ToqDYaU!I)0{gHIfW(_#o^ljr06otNeiO!Zowsipee zg$feRS!;gPFXP{G{xW$zafYQa!D<^12Dq=fXH)SO*IH(`M(vc5w%i}u>riRN2W9u;_xSRVOx5Z)iqs+Xml+ z?Q2hGvCDK1ib@k% z^sx4w%OXWBN;uAYvsAdBX_D&UJd&^U@1o#Lner4*X&BRK2sC_mka~K-d`i(j%BZYA z_F*JDhR$?*r@}^6ePS^JV@pG|rf#D|prBIo43&DlOy(AkD^_7B)QeCow-C#`by#?rAleDNsBH$snR)T~Kv<3vQkB2jdINV`bn&&90HsUYan{Ec&YUh{V%(;74A8saOhvc8u^ z%@P3h4Tas34*xvAZoJ(zyGYJ;Ph9;RPyc$#O!ntr9ca54f789vL0nn3c}#L8I|@|D z|DM=Pm@VpOT-L^e$xP}?kF{Xg1^aTdr}V6x-bc%rN+*9@sZF98JA?w%L-w+_gpGE@ zCGgUnGPtZ82rZ4{eIFB!`=4)gglyRz#BF~_64ZbCbQn6ke0)0uVi;6 zU#Vuv{dv6G(S)JTE71NmBBs3L_}G*l=08#gzY zm3FAUw*ZnwC2;=MbFR$e^0Fi{?UB<|>N3aZB~^$O&YpN<0SM~Z30Y8=TRsBeyOGc$ z2maxR)1Jo?qhj)8h>~!ZM$o4eyoMjMt7^Jn!OnF|ZU%cBlf7sq7(zI{w??KRn;fZc z>L(Rfvb)DuTkSDTi}Mq16b>KHnpAR-;GRs)wnC)hOeUK7ukYRk00b%Xi4$?4K=GLF zG0%5czOe$xWB&VifeD>ozgXjWRoHVCKLrkXMt_Y8JHwvCa>Ij;CvGnS_>4dA-&5v1 z;*%`Si(DTX<-Y{TAVZOR0Zt_$K)6@oujku1(iN?QL;<%l6SjIXeiu+^%N4)GVgN?* z#klb9#BVL^IoGF20Qm27ftINn&nXxHG^?2I#b48_6%i709S_}5y(Y=scDYRf&K~Vh zlL56$Zx^D;Bo*s3jIBZXcew)kSV(kIrHJL#p&{)O&+^(FuMv=OJ8hl^HZXg0Sz6cp zD^)fLN#eQs=|GncA_$N|5x=o+$NLF_vd)z*fRIGzE-H?sw4pRS7hPN+$FScEoyhf# zaHHK~sJSq=+NZiPiA!oapY#?_Mu^!M&XJ)_R zPC!Al*1TLbAB6-r0qc>5KoA@ApTmA}c z#at6!{@!Ln&V~K&HG>DI5)%#vzK`8Q3>()&Hg#hyE_LWYSMoG`%+6#~0 z5rnuea=S0!`N*s820L&f z?z?>igHQIvkvOa)$YSB}NA^Gvm)p~#>&(9y%mof;r#gY*AgN|#Zs%|)u!?DGL1sbUycH{JM0JxWoXpjl6fjudwU)>m^MrwWTS-bS2|n^;uQ#& zf?CiwZcKhz?|D?A##j&PLC8WepzY~f24QG@#j!kZ0T4J%ZBnmAbo3!t16{}PI@j?; z9#yh}&M!{V-zLnFcx?9#DkDTHW~)8YA18k~F%^u(BT|{8`IlC6#!GAxX*m=_lsH$r z&?$wbe%BpNV<~uZx@zx4;N_kDwGRTDo#OoDdd~W4Hn2=_2$I9Hi$OzzT~=o&vxZa( z-@V)Px-!Sn7PB_r=luSFv_b_c%F##cd$qLcGmsfHL+kZ;or60$GJC)%y{`X>A*9z1TEKdtGty@0u73E#ij9< zvqy}c`*F04Trz!33N~>Hv*B_syra%4W@a$abag<5J7&Ry6l_CV@yCYv4Y_2kVP3W6 z{(HPI%VoC-5qZ7#jBII+x?;7Pa<~%NCr!c=fIx?1;nnz!T}7&mE!Kd~*OS+>X3Qbi zK}-3vD~yaz0^0wiDUR9vlzr5h)iL}3sAK}HBqNI)C^`_c>RD0o$G$-|y2%Dr0d;%* z59Pn>;BwF5O6{IUwC2@`n+z0ViPB1zb8r1^@5N(W2%BA?0Mms}UzK4Gd&wHLE6dzK z%K+)ba&VRSE^3!Iu`!kSvVpSmHmDS6`-!{=04o_@0@7L_`)uIE&Uk&Z6hS`~KwCMVUoK zZvnlIgIOy@U}02-o3Tux{NqN=g_=E?RIAxr#3i!4>Q2i-u;e7r~X{lr0Z)v<@fk|P*m3&nrF*k^n zfAJWG4%E4OKP~~;q~huJ@NRtgNLCx|#_tne`M(O57^%F~NV*NhxQI|Zih;T#X%1o& zWch4vMDH7W_=wKIpJvhBQ8YmPvM-rs`0SA&Q|VqlT5LGr->fsXu3!`v6#2dH^&x{U z=S#$-(=NODA+pc#5FjlT_G;=HI88uZn!y-Xg84F0U(SChfS<95&>)4~m;d^h)Q8p3 zKEZE9jvt#fOh=xejLIgkHM*TA_Ae5r<(sdMCB!+mct=`p#17R~4vDo-#?g`MxwK5USg&^?~< z3g!rKjnPJv2$0F)#Ys#R$dr^kO`z>8pln$Pn4)`Kr3FZ$NR#{faUbTXe9&yfJReq$ zA3RCt{Nk)-ieolkL4WCMsJ9h+Rbb0I--rY=1rqcGGWzOF_$6-FySqefmH53Z1?L^z zhUs>}{wQ7_aW4)INDaMQ%jJMtZRoE$$q$hI0NT;aAnrE|!Vjo=MGABy=4>9HmE74I 
z$;_#(B8n7FX^0BbX_db5FmZr&foHG;pD|KfYM`UQEaA{%vRQ#;}h-g7% z=TB0wEzJ=*5u5&DFIsGrr`nG3+o$g3a%YYH@2w~OClRPRK98s1scEf(@ibDdi@Dy- zhd4~!0r~J?OU-%$){NEwhx6^|96=P#nE!lv{d83v)C{F=AATTN^$0%mR7AGr98f2$ z4U*2kPZujb9eua1k1>Hh8Xc`JHMfc|v;Y zHQRMPBoy-A05I~Xe=J|G|LANLWV`C0eVn{~rXpT+#>mHu>vq30DXPvTPkdITJUdsf zM2uKu(5#9V_g|hNg<{ntu^Lj=2^WoZ5q}plU4JuTP>Z;incijb0Wg1x;JldUp5?h)%RP!{y&Ry~}+(J4>WfZdIYeU2lqc=4XAf zipI+qEwb!14qsC59G4Z9awdYCp90i~P#VYhvDk|4zhT-s9JhpCTu@#aF{6CyU zp9;aR!+&^YLkGeGU9C3(_d$M^LWY!_xlE?UpF1a8{Wc#{#k6x67X0TKIN8)k#G@*H z*H|mc)}w7&S2e=PPpOtOwSAE?`AeTkevO|5z?Ii^)y7996}QQHG6B13A2_0PC_;OK z-LW}H9^GztU!v%d1edAsV!!Vs@SR_3M7=@ z(2_2ezKvwQgDo710Q`=1KXM}5kM2t@nW%o=^UvKzF%MPFv8Zc(z#tgFiP5XI_p>FF zZp4Yz-=vGn-e4>Gh#AwtcIjI;ELbbDLlaARUt%U_nGIZR}+XYxi%%?8yF@!y1(gh7OS?RT4=sJXWG;djb-m4XqEH$tAqK>RfGJt6aoAX7M3$;bl^4d<&IMUD4_ zm9WV2IE1ovsF}vTuNC`36Zn`14_Q#G4wRp~<@j%Wl*uM+fkkRnC;q1f4O4~_JTOS? zAHvkPZON7M`I3C}C}bcvT9Jx#Wjog43HEqLJeQ8TCg4^HDKSKzivV~iZgBQ^l2T=q|7Q7>-JE;G; zYd77USExH-tmo2hh&3@H*uQw{&ot3%(bognlpTac*q8%9kRt?@dic@B;kE=SC2lmc zpHWMb59!cWCflS+4d;$GTS|$u0D_@yhh{}Uzth{1_ZETEZjMNxxSFfOrr5>_89IcS z#p`p6L**YfF()`jV%w?dR5w)bt-AT8i0d|^gbV`(`At6C$g*5k(I~`e^R-rLm)2_( zE~ek-U!vV)v7pbR@J#j3j7fK3pB~O-1<7;7@zStl>AX16^yqi6jX1W<-@#ciUMBzM zN*McX{tUYyLWjT`MB~}KanCIGrpIH=?=R~l*Zw@1Mlds`{vsz`^#w5i{W#vT;cGSe za&^Z`y4zDOIFnQ{?BA+=(|`vczM4G8F!=k^niJ3mZ|Q^=yZ>Tnc1Ldn4nQw7XT`{4 z{)Y)8CKI4-I_p!zpJhKC;4Gr1Pi-M5UG~Ia_-_N9>ogWLEhm2?w2xdB_Jos8YM(n_ z+fO3tEYLeFSA*ZQ*=9TZd5HMuF!@J0UdG>TJ=3e8=L09AP&^sO;Oj zk-{j;5zA`>Tc{^My5(AZ)2}KZuh)cOk69y$4B5vwyZp>>zs}JLw+~KwTFCJhh?b7N zxHtMrEdw)GI77Oq8#V8#&}}{kU-SbeZn)VuoT5@G?QUz5uY>ln9@*5_6$|hf3$~Xa zsZZV}yUU)8PIK0jKj6!31_|9wgT^ljQc%@R8{3H#6Hi(z-4^z%q0X_6(7Dv@LpiB0 zNyhp9YyxNISjsc>l-z)*LJ$NC-EQ7|gUp-2XgBV&RtV}HJ}0Qm(Y z+w!bNLV>x%&fXC0vq0gR&zWLAHOxYWLQOtb5LpYx^5@w1P+jAc+sIG@LP|P;WXkjY z*M6_}=39h?OtS9G4buAtv<5ycH@Us7N0|vlg()Rd68s059a(?~2zInY(ECM<8W6b| zcl9EqHdv>L|F=<2gw^%tU4*uJ&1i{u3X#g2OfNNxaC$u*$~Q0vF4|uEiRRbsX8o#A z!W)MrGAm)lsGSZjy2IK#23IV6Kb>(KO9|QzqV(%SH~?|2^7_Z=QAI_+yn#CKBgoQb zvZY?T!3+gY$%OMXDrM-0VE+c&4C~=Q(-{3w9KmZwxdQS;lDl;GV-eRPAuQgONKr%@ zKYGHJSos&von^isIZDZOWKm&pB3-Sh+FxDB{za=i=CN@QLse5oy>Gst4wrDL@mlvK zusH*kT{u1W{pw)m^n|A4qaK!x8nQ{Q+GR#$r8ftrhwMpD`688&AN)%H(G;o}fGsG6<+4wh?Y*y4g{F-B}&U`|o+xeGewatYP}8aSo`5+V?Gl(YlBaUim{gCDQ> zP#}dWH>sHPB){A`iwl9A`!_m@?;uQ*-K^`gXc5j<9N(p-ZJZ;$RwHx!(47W7yY;Mc zh5tj^kv6w4fgj0Tsq7z;S=;zIQ!1~d@d54l{TQIH&3O6NGnLOYXLFh#ASLGW3zSV- zGQNWV4NXTO4xK~2A`HcjKyk>))GJ3~&@1EgFX{7QNBCMva2-Jd7AH4}99#1vsV7o0%49IL zI{89s-r`grKrs26?359Gfb?Z&3OZrcN{_-{-J42_Hv%I$HL_*5NgSM)Q`}*z6)Emd zkPUCqhN$7jVnK(#9-{2Bej!xTt|OU192Anrt7wgfi1^W7roS3YUp8~W?^VJPmgx>h9l6as6nW&dLC-%nDY7%1NiCYU5=r+~x+!BmUB{_9rx@3`H#i%R8pAzZw) ziP>UE3?n+Gzn8t^&3$>N-{JJuK-x_-us51Ex{ERm1cafuxN33pcWRC!d%~TLyP37* zi8oi@Cr^bNtp_Ov-p12vFanR5Lr0v#I1z-<8j5DCCCxW=JJy!lCx?%})JmKz1ElzxOYcgA+$=U- zpS&se^XMzsck$a31Eh)R#;_iAR}0Z1pYd2?-oUj7CHNjqzWQQh)s@C;#}qb0DHt;` zJJVX0NcGSz4^BjglVykcJR;nQvCz?iJs|o5P_u8dY`_H$rGbec+sR)q9^3~8M zfoHMGgij>ahs;3s6-UA!8sST8%((<`eU?zgRO+_`PQh5Jc!7^?fOf1QDVq1IIu|IW zFXqDY1`E0pd8IqeT!^M;OJq(dAKn2Mf1A6^<)J}#>8pQ7AjRfakdGWDZ8x`P+T9cb z-+v7f^4(px-x{>a$HS^|s5+Y^fNlA?=ynoYeX(0gu*f5SPh#W;ALKX^%?XfJ3`orG zsfoj~Y%rHtA2|NR{2F#aOfHD z-u_g&2w@9*DU?egD4i$X^@dD0MpkJF8SocTf6T+DqIvTMinl-ECArH6#4`0z?y^4t zD1df}=ZX0`VUbfzMS#4_pHIP)fsl2@^N)#POqYFGKw8K6MJZcQqj~8Z+`ouK=J&S> z;&d}+@ThyCHDj+wBq@gYt{nf-@3BOP+io$+KI86uehe*ve)4x6(`441O-xkBGk-WT?$Q_f z4sotu9Q=}YyK5oh?Ff9FZ!L{QrZr1TI+3NPd>%xTpfD<55ShZsF~BnwQ{P$SJFIrL*atSDID z^asHCEX9z3DFFHa*$hnhV8gvR^G(fiQp-p3whuR+Au$`xmg7AdfdbYfRA4Mn-m1*t 
zw-=i9wG*WLTylnevWmnABD%ExHcW%MO`_lb{Elui?XdH?q7ZCF4Wu}gE76JmnG(b8 zm$;9>NiAV&K?&LB?Eo0FCcMb0-2PFC@1UX&3j%&+>GQO9z}vdZ9{D35{sB9fDRKKg zj+8$#cclMmDRnLWC~8lE-q?RU?|<)+9!JTef5tyR@9S*~`M)2H|D4;8%Zwb@8iSO& zmA-zOEb;S&4#rRB3cDdgLvJ^Yr+@};Ij*zq@pnbF`(SgEhUqJpW^W{G3o3^OFztp)x82pri-ZRx6GoBze;I#0?`DE z0fa*wtUGEGLUhiY#)|^ke(+r%K^QS5%tjYqs?ACoFm9Uz65}6 z=7j@RLRx^fP@`zjzXrw^*=hD?HJ&+K01Zt_Mqty$3Jnxg#xG0^*J7K!b^cWJ#ZV=T zT#Zz@$aF}r8S7cl4K>7h+a&AZ>@z)sXx?;FQ8om~Pc|85w9Q!^G#U+q2?bCfZ2HgA z3KICaaKM}WnS|u@xgZL- zgbiV5#sKyz%|#an2J{oYo&xnTrv7w@nQoZYx~)VJF);|#>64VAPCC+5_D;5s=Ws(_ zqDX%p(4-+)(ZPMVwv*bQ!rJCjx|=H1s{Fb{wL>5us$MeX6kd7@A!sMPmY;n1uSAEi z(rAJL*LH|bs`z9sgd^`otw|cb8~aDs9>9a-l@w`R<1k|!Qp0Ae;Xt&3rS4L4jbY?f zqj4`DPuzve^U;#tUy+4med<5shziQ~XN@LZN#8)>j%SII=fsKrHM}+?68q{-e|&71f#>cXhLQ32nDyIV)~-SpR)okO8;fF&?r6s{?ap)yP_6}P2c9SY zg~5+*H-oL7yQ9CIcSn*Gh(2Mmks@((jC22eS1QCw1L0E^Hbc~eofcuAkL!OxNZ1$e z+q+-+{aI-k2X}aC1jUvsVXy_!d#tm!P%elLve2$c4bie&i-ksD8Sl1w(x-L=+-Vp% z-&nMcib4sgOcK|0v0#*OMnDK0mlwGb=hw}GG2~J}LJC^Fk1BpKAqI`4C&tZT!KQF1 zwxAjKFkAhCqgA#xaKQ?Ox`QT3J>eX(R+)*xlP)&w8z^z>iK5Lcx|{~$3i?@K8#fq{ z!}s5B+gWeHc-viH|4?=Jrbbx4+q8};&2`&cOfxQHNRaV-jAmUmkzs($*ygs@CJ*35 zSE`WyEVPZ2j2>lB*TjaE+4L^+fyCWERmV4ehSvC!$Us?*^8%^meX4zr^>J9#c@NK1 z$JNxLGTWa!1-6Tqv8WK^>6d_L$MbkWhuIquD zR3LoFW6a1+4w0PqaeDcXkc!)lpt2VnVW^jBVY)Wo5@-E6YuPnMq2$0Y4ZXo)axsq@%?YJYYCD|FBCW6s`o<|i@lEO#8ct#{1 zoxqUu{=?^Hsw(3hvY19kPyq*vV%8P!)tJya?4VU}XDN%Am zV#3T(a!wz0^`JG1gSV`jdtJu1xPVCrsi3c#zM+W%UGQt_K#}1-hRB=PbYj#HIWFt) zeAJTf8tg@)l4+;RfBC}YJxw`T&?(i_c+r@q(*`)pRvs%SFksJI@AEA=1tv-t2`Sa| zS5KCkLug~7*v(UUmX||w%}Pm6b=M;tIDO+tn9xZ-we*KPR9gNSNYWXRE0**j!06Gd z1GMG6T}EqO$ytWWYh-I@rG;{!wYu6wjC5j;_#|jdXk2KDF00U1W|Y-{($xG$G6*+l zJrx&Ql9BdNiPJ|?l(m7nbw*UgWKbU#KV@R%eImZ)asdyzv#Zf}#zNVo3`ZlT?-Pf4 z;jMm8bU>q-#ZVsKThmv3eEa+{eYC+%;%bzMTGRCdSiOG>26dx==ce~7sV8+w72E22 z@bT=hTnZh)x`5K)!1U2~jW(c-TwN8Wm zIl}5O8T)ratu(o;%e0Kw*Im8EpIiwg6{ZMqVEccyW3Z#6C0KnxT3`h!9SJ8zq^(Yr zJG|4+;~;g)*+MxU`Fmq!pD;Ka@(OJ5sf-R%cts&n_8zHJf0!ZeC#%i~(3PS6f98Z| z)(9!mrrB(=>*}#?(z2mUtk(;wu9qm{dRkKB zxXfKLzSwiB!V=C}pafBB3D0~QtUf##bwH!Uu5A_7lkDPMvW4O|lw zHuuGrC4nHoktEb&!SS|gR>EAhC5KG6op_u@F>Ip34xv!6(Q5i*1Qwn6ck02^%vK!ydMt?P9Ti;s@~Ea0aX-3qB;FJLzcMr!B$MTIx*8G+n-6!i?`*^dZzfQT& z|9Nw-RPgr!)ROiIOqlxD$GpE&H1N#akb_+DpBf#>8qbUxvft}mq>`DfCL5KAsFkt; zJ9_qXEoLmsBjoe{DfoweAK)zS{2DbYK`S$8)aW4oR`=#Jb1uz}2~>90)<{klT2Ulf z1oeJQN^H@c1@-K2()FO|Jq(LNb)(dqVy80(P;c0JJ1X{47rY+A3_6fPLqK&hYgd&DTELxOHnup__z4|Xm zN3Bw}(Nmd2q|76GBQzeR5`J?DSnMqp z@|UlWb~^7=){QXBGzy}WYMEnlt?ZFQ_=LW@B>vqFb0>jk*g3}su*$%@S``v%{p6t3 z5CjqIv^zwSPyhNi9zWgW>Z&B z2ktH)J2PNSWBX*!FOv z^6mT<51S^MCYLWY;t%Q;xL?q*FEM%%Nhwp6{R6>#uB*-8CnOTv+=pm+3EC5lnq(9> zPsEiaR+_l?HDn~7D8MnM!r*HFYoTf{!b-hDdX?#j;|whx?3Ziu%L5xGjFQQx{VvD* z={TH}iod8VJ_fE(zpYb#z~Mtb?1L#3t+)r znM%s#8l`HFM<=XA=x%w_X@7q}?8b1HRHRgU^vqzlxt4uA>zDb#s1ct{muh=Z)h%aY zfyeM22^)SRwm%hZ+*iMQn*3QDibZe4XdF}|4+kzU%8gqg3nmAzpUI`1J2eIhi5ppE|O;*oP8-ov3584LGzqqrqQtgW5?87SAQFS7GOJaoGK-7#TO0L31E_p9&0j{q3dSSa1sZZm-44#X6oiAr zBGGrKV;mRN-Kl3vG&c8?x~TQ|(ZW7;Gd6*1$K;UxN*rA2kHnJmX$D~HLHZd>eQ`A% zAol073%TuqTGUSh6QZQ@%%{`ROo#!>zkl7|Pk#ca>i<}+i>?lkZrqNZuuJ!MhN*RS zS&Txhm^A-#N65o|lIS#J3CqeG{2R_?O^6u)4E!qRn|6mcPto;J)asEy>K8%(J|}q^ zPX=f;8(2ubRSuVllskGmmp6Pw*?lx#@hcGaC~#t*5~2q5WhfI%FPP1S=cr|x7bKG5 zu}8sx59V3K@ke%M@Y!&D$ji2>-^VxqK_!sDkf>V?&){%E-#JN(tHB-G;CQKHPKXhC~~PD8Oi$ecCIWd+UL#*^ZYacmJl)U zBcO=Quq|+DPvwvIA&7fYq1t4@iP5qJ`qtx#S#LtqgF8Lp z;u0olZaIN`QJnxA_>mf862rIb${YLahf^9?E|unD#hE!H2^fX(NP#<*zVyaGmT>ib z%RJ90sJlLD~?prjkSdt(XB=j#V;0g+`)LXn$L#$27*PzO+)SH;& z+zyOQ-PvDZB%7ZVp^;_ro_{PGCwZ%mUbQ}7ioux)xRP)aG6!ai1&fBY57%0ijDgiP 
zA6*`rEyZ47hHMwz9j|T#N|km6opvv!=L!nO^H?a-NJGyy0YQt?gVna%-G3Uki;W*0 zP5VQ&p*snt#1-3tEq^bAh`NTkDYeT5r+B9 z4jxO7x@fEV!W;u^zB!qs2RDEC$Y~CKcXL9>zz+}%>Ep4g{pK5)pNirHdSB}zmOL)^ z6nPyt=$H95R3%h?D)vnLJfWwBx?sS})ROyt&!0?KIFZ#IEhUQ9Pmw<=T=q*)?WzPUz>E`e})2`oscmuTA?$ zbAR7`uy+^B8V7^C%n;`S!5o#j6ZTzPUcDsA;%2}!MRtEz8lKmVm zJf0KAkKBr$$r-CD}tYaMuC?RjR&HxP&aK=LY`11o> znnq=;Eex{Gw0*SBVWFJw<4L@}`eBiWlIBZIVIt zYEwAEuHI?;>krelFYut`0+GP8@ogr+1Bn_{J~4gxki4H>l&{(*tJ!vvMN1^TCw}z=RDJ_zFtI1ScW2-K`I}zt4oKL6HJRPsai~DXF`iTrKz^YU4dMZob_H&C zMrw;VqtmTvs4&noW;R`CfRnnK@|H$is%u{kvoQueb`=iCbCrEa+DZdr?JLhCA>hbx z(@vYg(FvCy)b)G1H=C(QN(bHMI`6LU&G{DSP+rWGY<@kWm)P-EuUsYOuqcNfCdSJM zV1IqfEnyxd?+D{-ddP9Ae4se`s9ootQ1g68syHlST1JJRK!AqN_8gVX)f&sw_j#KP{H z2+SR0xOk9c-5P|b1oai3lZ{N9-}~m+L9|XBX@Bzdn{0Z0%8X@rP>EXMe`12 z(rncDel5N(xpWAPbz<*`LBV@|4)Y}w+B})j!OJWJlmse0-kL@!F-kSEO*Mdc0Yo>X zC8PcxULWpAYO)*xVxxqYe5dI-A--_h$_c@x`(w^+jeU{20K2x`4Xa!ipZAZxjr<(jENrp-YEkLPH{)dSHX+8}{1!~g~~ zvbFERWc@fqR%V(}prV2YSXAfdz@$Af9@xRW8@XE1WpLNb(A4-i!O5DxMCshyDzjm- zKXrp|ul~*LR2xT|^>I*yy^(DyRQ>Hk!iE4Ke9~Gz5)`$#r zwTNhmB2qZA5eJCpcvh_${G2K*e9f_A(8M0QB~jEnb+nj6bp6zVKk8j4@>24I(MJ zC^xt=Jvtu|F3yw$NxBrqp5H~)R}qp*>)@2sBp)0bLY>}&QXHlU@%Ic@)9|Y0iWGyK zwc9ZM^Ix^FR`YD&2M4_ZQzWxQQPVs?j-pBPz~}L`l#-rKQP&>p0V0J;NBrrY&S1E7 zifA93g(s}5?dEfsjVCA^aNRRgxcj_y1#iO*=scM`p*c*#Ji{ZcB3QOER#1W#e03rD z@)}T*I8uD{UFV_B8_wgSoPM#3i=4Ib1L!T^R(s7-0|dF@={>1kNfuI4loz-wJR9uC z@^v8aqgH}sNqhttciloMI}fWi`9*=oZs%{;-dvI0pB2#dH=uwc+OucwKGa zrs3EHhPjjGDXejS&EAToS6o!gpyEg<>1nb$EAsey@LHFFvX&>0*Cx&pxOx~lnh=N8 z@Dhr9(wCBE3er;0f{9C;T)PQcGE29xN%-H$f5zUyl8KOaFk#MB&71yY)Mr24bBilp z{vHmr`*h>bX75c;a`}LyP5yJy1Fg!*Rte|jWB+pM zN)3X&QS*@!)GK~q24vc-STLl^DwgEbl~PU8#@DEbBP^5=Y@4VZ$M4y-W!$fnEtX08 z=`_nRvJD<1-bTpPQ$ba``dJ!!p7j*I&t#I3n!JdWGD8e^^}g=a)9tGuhof1~Aq&9I z^+_a(a&oL3F<@!kR@ypH3P>hBs=oeXmB}45AW?uK{H*;!F6F3*Cwg7B&A%p_MgCe1 zA0Rsp+?AKEA=?M9b&EiUpx2W<(qGFDBhz12CC;*=OxGJi!m&O@7TI%5wGT)_rMh}$ zF4^!3|n-U7@=v~P;=V1X#opKRBv$wt#Z_8uFcsN_jt+M_)5vZ`XjGJ2m|yi=Tr z*uD$!EX%0EXA}G}zKZSc~$+J8=_$Gm1U40I{EV`fG;N_G}w1bmY^m~(+l2dKrk zEgPbFuOF2*?s2u_^P}xa$RU#{l&ev4Brt%;1Uyg!&(c%mGawao%ZM!t?>0Y6LRCo} z79IpNZ5IOHEY#-naBRWxJj-euwhHcM7C4G$s;M^mW1@tUnI+*a``vVLz=1J%W^r)h zU6ldarGop8e(^~=-oX`c5fP+4#)#%9R3a0KrUEB0)NJ%-)BRoTB&p2m5Svdhn6$Ix z(Iv29iUX{S0hBH1q-y+lz98p9Fp01d<~mQjIUoxi(`&^09ZRfI4L)F>EJWE|{04yq z|AE3|$a+)zwvqAS_XJs=#kx60_#p?e(Gok4R|i!9Ato1m5Tepz%~H>>5|*iywRu zP=XQiEfOh}2l&%|06im73$PCGwDz=75s5JDSnyJkbXo1J$f8Z{NmEY&fw7ORoyPsYsC zt!Hd?TkQ`h^K6~o&sGM9(ojTW)Sn)f^A!TgEK4JP#Av_W#-u&r@er34^ma1meaI_h znZFk#G7;5{H+u50DSSW!bytW)w8TgWRTnK=WSfyh#@tObd$o>V1uT6Pmtb9JkB>^W z-nGg0-3KN*d6C*h*T20t)cSp*&`=a18>I%Mgp$PUDcXeoHvZ<#6v=c9E?xDJ z1@17gy}x;USPuKgFW%)1i1Yss#9>}5UdKF_xV z*jfq_o8~cGDTCLPK+1+0y&`Y}FG3)~h1WKYN%hI%_C#0D7|@y#wEEm*eS%*HEPdfA z1+TXFm3H4whbg80<$A54%US&a?nVK*+WI-L$?ALmG`9GbX*`&st8DCW78{vVA-mO46ZwM6LPb@{2_4>EZyqSv+yPo^9-q(-DC zLdJsTl?G9g`O^7zL80p@{lFAs*pH>;`kRTk>7yG*G9nn0n8HX*O;TEX_yoPKQK^2r z^?mKqj^v9b*sVbX3whjgVmU*60&-QIK*q}!QxYTkZGD|{f5>3mp}Q-fN2!mz?4wJ z#%9tl)*OnaQN!VB@uClucw>$%wn!HFHOP*nemuk9A`W?pGr((PnA|B3?QRc$DlLm-CURIjrAUF!tMVj#XFRQ z-rvi9F>;K$qGH&Id45{{`nS{J7kHDdOeBf1UIql;aiA~_R;JDe$zwXZF^E;Q`P{v8 zJ=n?(@cyVZPhq)c{K}*^ys*MdKTg|>Ri7OpD=SDw?5aQD8#YcH1w;bxrc%E-V$5D+ zkMb1n0{>8|LLWhX!xC|tx%M|O_&@eDQ(G)D2Qt(>!2n11>k{r5BOy|a1N9JR;s~>D zg{`LOYOsB4Mf=HG`{wMlR&nCbSk-s>NWb%X0?@-T1wv~!s@0WHdppna``0yR6*B2$ zKH}MH7vBXdF50mH$xwv;_|k&_zdOOepG0q5m*M0|yF7c>UJ5t_Z?MVi0};twJC;%c zy&n~P{Il%Vm|8johR~OZoLCTN6_rVU(F5@f%3uZ6$@aQT$4O`BwtyRP_s*Jm9&~U+ zP;r%D5RvU2?aIg7X8R`xkbs*|k-|~jP|~oI-fcLyHwFw5l(>AvZ;8QueAVwUmJ@KgKjnad`ffGgt;b`c_Ff+t_WntG 
z>n#3QyxRMTbyf-{KU-#mJ?2=R(oZ?RJ-(YB=OkU0{*!gJ)hnpsX|da`n8-v`bHI-- zRAn+-{$CIlHW=)Ft}jo^SL@$=i^vzX|xW(G-#nj->$w?D51kJN$7vnsuVE&kj4AWeD+sY zzL9@#vW6pr;PbqJz+|1jzspXrvk#m5HX%vV=WT!Wy_ZJp1gijWf9qG_UYDIhQDhbi1-U2|Dv)z>J$y~gMGL9Fo3jX(!oLu z*q>*ou?s(g;K(TO-tDO?8+t$rvt@-(JTymx#4A9z^ET$oavoonV>GeUNu|9)8sQf~ zt4XnFQ~0@xeu0z23hg6Et`sP6cya4K?*b2V64S-3de|_m^z$nlhmkCPjd7C8e0Cr|-QT)k7utS3;|mhhfz86^%~mIsZL#7q4-*2fzUF5d!^O zi1<{dKJeSs&K%PeQ3ARbSO^fUlCENJUhGy@jWHpc;+yDK=&KaS?Z!E#j48x(Sm3RO zkBLpoU}itvD|MBqrA*|Ce^RG^p{mRbOlDg9`tt1j=z59ccW1`7SXa_Rt#x&4A6!X> z63I;ixwCKgyg?VY@cM;B8`+Bh&%)UjEdzHbzy2(e(naZcPEJ)DI?ORHKi@?d8jMF&PR9Co(_ zZBCn&5nAe{q$sV49FezNn8hnC+_Ws|1q^(~foE&Ix4{(f=BO>Hf6+4tw<*clqh6ew zw_jF!#MrOX4nopX942*>K-oOqO}565zD2(d zB9k;O8b<8`tIH!sK9UKWumd62`0i%>>VS8Qd5$u!!)c1?xR2kH`91WDWen-49=%*Q zxx{6=%JJBh?zdqchYhOj>VX^Q6gDPL<`x~D7P3fN{F+*-@~7T59U#YC?)Zp}&nCsA z_(l$ox#06SU8SMPZdXuhvpJd{qb?4%6*!F=T)Xl8^9T3pw+VOOsy6|gkn(M0f9G0V z8Q&Oc2Y>V?G91y(Paks0plzJ8pGJ#A{Vh=yKUxN6F=Oy&NOI_@`r(zDCoMQh&+_UE7*5qO*7#$tCV-)v zH_RgY2EW6Qg)roblR6j&F!-Rg)iS&d%#pJtqBrQzgXs5}$Wb7{m&(D{@!~__tOMOC zg4$0D)-VuC_EbZaa~XKZ)^%usZAl|wwz0{Qt`7F9GA}N!R9+w0_X+n)XNHBfJ|yBq zSgI5%Ep>mVygwi4zupx?@Irtt$wkf-lsjCXXr5%|yW%_=Ve(O=I4c_5*@-}>$6YIs zkSjEP{iGsDCrrfdeoJQ~=ufy+P{gQ!E@zspGCh-q?q4Nu z^ur(coC(CfE>CqX4D)#sF!xv#2!A*peTa_Q9g9i0h@li+>5IV49WyfV<+Q%sB)KY~_>XDK7Y?5EUa;Bqwq zZTAf+mnvBjnZLzSKt0xiU@cJZMF}DD);|uWK-0EdTtmegR@)Ttc`ORh2q}`&g|xGu z6g4v6t+q4>1e(!dzt~VsJ*l*mP)o<-Sk9P9{5gtAUG_iSE(22ub4yV* z^oy-xC?sXXbnIgyF7Pb?u1%sKYD z3Vv1r54{S9dY0xey!2Ddc*bv~Ls97rva*bZvSFyV@1Ks3xp9{LO0^SC+1e2@8mxOh z#~8&;l^S%txv%^iadU8{0E&ZFE#BP#k7c4b#h0$S^A6C)H6`*J4z*lHFey3%h(SFG z7FX_+H_2Yw925F0%rj5l17=W?f5~pL{NE5q8nrqUm16ngPFRi~zRM8{uh9T?IP6>3 zi^6|zltR8Qfjo{NQpfG!hpn(ei`7!T`iv6D&RW;ER_ER)OTY@UQ#Y+3AF)u0_VuMl zhV;!n4o8%sWDEH;%{`B0O3PwL)rCZ9o(%ZFIb86t76(%Gh)NTavO=cGBq{^?WWPVf z(rB`=zLDax&VO&DK!A{fyH}gdzjQ1K(_&|cyJVwc=R66&N3-cZVqwc6s~CmMUI@S@ zScZqi_$_4&Vx)UJ9C!b?U)YfSwo}`@5E1eWU|5JdwS$K}rtmrxI}${>JU*f=y!4)< zehd_544udJ2Ayj2%)`?$8VnbpkDPhesf3{)WCa(P40<=`gD!8cHNyWGb_PBo>b@dQ zd|qMp1_)iNdr&LL4_#e0#qBHI#`5=5Jrv|fm9jT>iobaQMJ@gKpOZ!ejAk5tvG_V) zK>oaV1YoCgcnDVzrV*RnBhL}CAFH!PXkJMNiH8DGmXiNt` z^8`{rqgcbk!nBIHlgYZ&as53KpQlo@c4V?!tzQ8~JhThtKw{OK|@>4*Lz zR?-0&Tw<&DnC@jtlx2v?%4$&+1O3AKTTV6J^z9PZ~NlGT|KC||L1Pf#=!U4={-FH|)wgWc3}U_)&)>wJflkK%~d7ZJER>&QrpQTLM%&1#iy64W}sR z0e*SOC0(%$>X(U*0@53It-%z+61SaQ5>$bsj_`|19`B^u$(A7l?!@FWkrbMB{&00WgJ=v9rPd{!xmc`w*O+z6=+R5uAs)yGV6hdX z{*?uLOmIjP);s~I#Xpn!dFxfU#Pn9Ab7MNf^o|P7E!agqg@$!w;fu$B`@F`Dwx>l- z$FGV1uLW>2df39EN@3sm)p&HZe%0?G9=?d;TEvq%7$X_F>not7Bwl2v9a|?D;6Zsf zv~%ub-sliu2@B$ZKfM20C&(+tT13LS5Y066*!&q?O`oU3dGt?rxozdZ9zZO4hh`Nr zM1}-Mq%|CX4p`0Fnuz&u*uZ_dCT)W{zU?=xSOvJa!j!AYeS7xm9WYnc5w!xJf!VF* zjyNU)1X4LHLasCcr*)bDHI|8~IK@%4FWd)$PyY^$we9yKiXdiH z>CGlpb(zVxPOaZfHY=Oy*Z7RryZ`HmeVOWZ(9Kz5++R*klC%_!t*sHE_K!rZU;=Fw zR+RC~-Sa&!T&AIidEFZCr)b}Z+8g&*vVC0pB(Lb_#&_QFNQd{U-3}+2cX7EO8j5Vt z6g&x0Rj015@kRS)ZvERIu_rsA;1ZKUz-BM-y_d?M-{Xo@QhguKGKA6a(aLS6Ji;!U z@W`A6**WdWQC>n(FP%3pzox6_)g9sHqMze9ZvWN7l8+{Ce+vrmYFe6sl~18hGDagtbG@9~ayYhWEiQZKOSslLsFWfkr( z$W@5GQ#+$QTB#!LTyhUXET&N7>NIy8F{*Z12atgmkVklJ(jJjNmxhf7rjQVt2^uQK zSqT{Y$wbjwde8vwPmDb|-l;(A!6B}B*Kx5cGC1Zsnne_p2+0WTs;*io;a9&O0#W*e z#Fy4hnp);bxO->{or;*qw!O{Zu=BU$I4{D3+ECK_G;h?WEMuZ12-9DO)6YE}Bo0@- z`0A=Fj$A=S`*eg$&1AUb&8Lh=P$rHtSY{M1AqRuXkGRu+Vk!{sxNA(E$&!#B47xS3 zJ``BS)Czn6s8lxGh$9M}x!U6xz0<+(UHgjwg2!AyO;J(p9YN3}p;!-_@R$ouE6?fC z`)-h{^I5 zEzCSHq1Z1(-=~;cq3Prhl(uXe1px{|43J@V$F1*B3qYJKQ+$RO%j+}vF&sNd9A$J-m{VPo3#f%x% zGTC>3HSvys;JK&nQ!?NQ_;PT(5dp|<%oc}^9)PHh1?FE$0+CwQUs&Z)0FMt^fMeqX 
zxIAoDHv}LCaD+vqtN{DL6pM@+IhI)90#E6N4LpENa5}|dP!Kf85f>Gz10XP=hT5fc z)9C3~4pT19zP34A;CY#_Q;6-cWK%NsuG&}}|9giJO=m)7GO z51i%>?$!W++@Jxef(`(mBPU^0g^;i@ekj;70~$w~mQwD7lAaPjW!DD4ugG!;oB$Sv z@YkYwNyNnJb8%5Q&_qg;hBbg^^rt`R4{&mf6sdet1PHJjRoDa1Q6GevI|lCm`}Zz4 zU@GKXw3>nj8(9Co&GCoJuk-*twvDQp1^#Xu#~LKRA|zQ%Xa^DYg#)g=9spWmi_H4{ zb&+%_?$jRAb0|23u?uL3PX0(M zFg`IK?`qb8myZSX$5O|q$PM`Mjs{K;vbdq7bHM`lStc<4cQBlgmF+a3R<{?pb8G|9 zU4rRf@Py)KC|He800eCtVxt5M(Z8qp{d<~tsw>b1e5ezH4xmSjqd;dHA7oE~pF14j z1i#_Gz9H8E=Xm@69Yg^RdK)q<0MwTZ@_(pG7)Pn7H+s5q{-@lOi3JU|Fm~_Ubh7lo z?Ci0V&Em~*cK^`V#^uaGj3UbBZ3KgN#0WFw#8JO(GNbBG?l~=x)0VL}@!}iMH4pR} zH-EiJK8MbZuH1QG+!Fg60{*G2$(3)@L(lQw3;#>kE<|AoO=-ft$O(T85YX8(OUur& z9C0%THZ^xp_mFzRfJ#`)cF?<-a3gmDt`=*fbzSZtP2B0JHW4msBYOfau4za|guIS1 z!K$XEU39}j<7Y?4CQ1EICCA#+38u(Qt+)PF7k>u@B;fK4UfAM)Cou*)21q~qHMY9Au15|ghEptpCT-QNZ$JEkjV#4 zPVHp^0F^Qk2Dpi6^SSBVA(o5>*XU&pn|7v1JTMqe+8hsA)#(kuP%lrW*j;nZR=-Bi zl!%`2)FMeM@k+fFK0wF2J$1RRG>A+)G2T(KWnY}Fd9Suy$0MORWxErE*XJI!P5ZN( zD@17$po}H}JOrU$dSk*6*>?ZSptv|@q^o=~5O_8mi!o!}GqC22_h=pAb)(hxquSC% zubnv6rO*8q)R}vuzVFj9Be{4LIP}7TZ zJt-?ZJU)Uuz(==gO}rGt^M!Z$5cL2Z|yYX(N~vg1o> zZwvsH;EN`X6xcgG1ToyxK~j5o*jZz3i6Gc<2AgwH8pSwCgDN;uUwSkJ2R?W*q{$`# zrZA>zw z&(Egt80KsA{e*<2hKG`J3{n6r8Md>`<?Kpua<)Qgf!LDwhb+Pt4|Z$|Ew6nx*yfrl7kGl2MmeO zbcT;R@KOJ@=tT2IO&7fr@NmFJ2fK$gal@8rOk6x(Q&awsU;;;=6FWkvzDg~K0Hv5< zgnZdBa+sJD7HdI*DNN~s*vvT>W;71Gzx!kGiQ&+xdVh823?^3?l6<5~$rVLhYjX!e z5jo(3F!zUNewpY48ogq?>Ef}!)?jdxedq#foXSHZFcp3V+-$9d;cWP}XZ-B&x`WRZ z_5~_NTB&9|IHGWi=(-RrqN-V`cT>L|VyF9@t^P)jqGk@wqKD7!;qlX@kk3E~5`ZOj z&sz=nR7xWyZS+hW(x%<%)D`Yk z&QJ_=E6<>zb9u~7(?ISj;(Ky2b_SAPIAaogkMh4&^C@5ki`NpyBIv!jo({D0ro zLCz*)lHuQ2bJ|_t=1(ZV>wM9#vz}TrB?F@YeLSYJ?`r1r`%)p0`B!fq*Gn=x=Q^fW zed*ouB3bh9tu9%!eWZPEe-{A{h`Zji7CV8K?XhPFA#K_mzSH+L6kE~YZ`gSahB?z< zzajxKD`=o^5Sl!J+#MNRi?+_VzX5Zx)rr0x$|Qtl04|Z@>$1EhtH-?IOZ*-zWzw!T zTSU8i*y}^w_h$d&&O}Tggd;IDOnE7`OpHP;N#L`aTNNc3A>VTDukacq(?LxeZ6M@3 z^n8~JTlYYUgAl;$v+l4noH_Cd)NTAYyGw;L;%UiV)Qi%cwm8(2t1Bl&yz28Ks)TtP zNQ1n11jBL3?5e2Ak`l(n;ZrkFzPU?SEu)!tF9N2?bO&JVNuCcSU4}Sa(C#{tfXU(@ z{og*Q{GrQZ>*G+@^ojh#UBF#TuX+G5sUdmGlncS0^Ljpiytlh>P?@|8!l;;dTV5#nI!>pcFnjyqTk)`^A+Z(IT^h~XZ4M4ghA>n z{JHs?SI7-00kdqT1RT#R7SE?=MM-``8WmakaB__sSK5^=;Ghx%C2C_bnDeoJ1Y5(a zrPe6m?Q!eXP7%n&==3>bqq@oXg-EZjB8>bKlPnI3g^0r$1s;RW5DTJKufCN^*_CFP zMW;s>;+Szm@#En>I%+P-TTyb2%Qbwh?I^w?kqFP~+n?ZO0|(}{ z>QN2WO<(WRCbk3_@uf!7W3t1(KyKUIt%%ShOn7(YBYYk=r~E${59edBA~4J<`dY-h zJa0k$t8%TFBdqaBk83yzzF&^yG0$ysAhJ#@~1yCJ_OietHi_ z<+6PyQOxPFiED()unJ!H2cEo}2oJG;`?Ry%V$BM= zKn%J3a_IN>7jBSJtE?qY0)~3US1;RHU(AtYSZ*T2kbQU?&E}812)p88euPT-PH+;k zPrT0{%E+nxxZGO1Exs&15BPLvGm)p0L!R)DCGUJe8{8)%hgL;X#ftblj9ICJD~2Qs zoP3*|r&1=vnx_>mKK2U3k})}9kJk%j^B9a7u-E%B;tJcSoOE;g$nC}bGJ;9Y`_=hU z6SX}3G9EE_yOT|d-oSNoT8K}%*dal8<_x+GSg_LaJEO_ayiSGRy?PA%v-_WJ+mw=l z97`cvKH(I~#83YoRKIAy09 z>z1lksoNR}aUj zHJV)xf^N=e$)ObHd~e=Y2ejh^LNO&@a<=!MFlk3t$mL=g%!ULQ6B?=iL8gqVSl=|G z)z~%4SBA7$N_u#_b57Ka1lD~8#)$=qgB=~kqC;L5e_KGso(jjxFR_z?ViA2r6?nW~ z2;6GcLX3s6rP?}e6S_S7(AmV~e2SO=51>6qv)g~aYr%fqaC=+gLiaGvF@S)V#sA^+ z&Ta|9bG{}_`p*xZK3h+mH#t10{@MH`J60R9gvw+ZcH%Qh1NqF1tIZ$r-A?n>l&nnl zM5jU8goMQcor(zgt(bI%U&D~!5HuV0b?c}T%EB)3jRh=+j3rb)r##l1YvAldD7Y!2 z-#a~_I(YX8N%fO?-EPU%DV_G=deV}dNoBFoA-)4~4zGXj#n;AFW#+G&(Jm zZ){hvgt2HWo>}8S7lLTzrppb0h;&?nkdD_$XZwm7n=Cbu_*0aWYTPp56o9jIW%|92 zOyf;gRtjsx4w9cV!2Q+|00?rZH0CHriRY8=(<($4%p`#iI6M_YZgg2JKgNEy!(h_+ z#EY<})cRD!p4$9yMVKAhN@kkzGb72m^=>5z(2zP}@q6B5Oi|E^G@VH)D>fO+R5QwI z`)U`3GZIj@=nmON`orUz0m#57%a+&D+efqq;7Eya=)T>v>C(5AO9*K#MqwpAJS!^C z-bTMa-&~@vQa2Y!X5&2!3NuEnMaXmMS5o(*=`+xKD`te-!hR{&DOc+hpt?Vea@qDY 
zQ6L&oUVTe%Jzqy3*b=f#EHAKzfGxDzxSSGvVq|%ZlpbDo4;(sW|KoW2zqQM4I1d#)ovI4MX+mhPz$HG#nsXo9IVHf zR|;5ZzO1%Kh8~_{n|D9cwjS}X_L)t-kH`c5egMw*%qz;grM-PFp67in3fYv;BghP5 z^8{MSf5OjroO4-Ujkix8>kl^Ch0Sa-0tl8YXfbwmekD>4XKChD8vG`*H|UUnD8wW% zII=zPJeI0WpHox_hI$%4HtYh(0Ik^z7Zfx+_V09Kx_gp|jU4e}WNu+$b3jq)QwThF z-*@f*c8l`euKM`Iu|v3mw~z6m>h~E)Nn6l$nivXA9ewVUvrY5Rp9{ZW#iyZQyDY?h zi>HurIwBJNJVO_b^9BDb003r5B_b&vPBj!0)3aPqOm+Uzt6HBgAW_e0^eG|uoPFr6 z)O$mT(A4vl5{Qlxh2lW7K~xN23F#A4}> z)U|04PG8T}aNHTz^!<`woRc*(B?T7UqGnYwFZ8pU{y==>V>qdX5#SXp;F%9c?oZ?t zC~rx6fG5##olW|lz8|BKt0S(FVhMvD`u`3;($_9ov+vo1gcW|GYyjTiAjlxrTCRk# zf6l+UJftLx;utfp&**ZzgCA$c88P;ZH55xd6m=QD-90u`Dw+gd7WZdfoD8vZ)Pa?soSPFKjesfUdNn3vaH+wLcwS2a(BoWxPHj*I0* zM6`74DAO&?fy{l5?lkopiSSE31;6^lQOxLmcS07z8f6K80A%2wpedV-~cv zOkdX_=FHjpAfHV+~MgFdZ7I^dbL>A<~99+*$D9*GAdBfAQG5Z0)fCg%oX}`)_R0xNo<)E zI^2{KgTPTOHfyF#^1#8_wH)GGFuJ)yfozF_hg`*^_U`aj6R-25ewfYI?Gb11INU1T z*#)ohV1(VOBn7E?y&o&p8!a*FK`>=4^N4;}bh>GED_w8GGNE@k+{nptsl_2S#2_XE z4Y>zl+OB^M#$&|E<#WeMZhoM`S@>ntm(p@w*_&o!np+EUNKH9Ew(J1b$SS&bfhTOb z$w5(*dd^oK)9J#+E)Uka1c%0<2Y`6u8mCvU2+VBUpT(FGpMdZ=ipC{CIZ22=DbZim zKSQ7D^c6evGsXoEW}W%qky))q6Z1c{V#A$NpN(^gfE zBqB#$?|j-E%;S9e1q#_*fciNyw#0!I%Gb~gRlIQ|RA5_`)Sqp7cvFX09Pk(tU-7n;MK ze3TIDm0YtR)I|~WZrHD8_G;5xjI(D)6qS=uy(0&I%beZx%QoKb>e9R{rr_K7<$AoF z#&6DJ490L`esGsQC(C=oLlrMZiQOoRcO~LPlLIJwSr0GUhY@(fuLRp9XHwAlj&Wx+ z3BtmwEAAwHUO4qRI_~HB@)ar4;}O!t+P^p|l#@_k4M%}q09nWR63*?vic*_>M~f&e z7<5{k2WNnIVcuDi5icoOxV&YFW_#QjA512eohrJH7EQTkn{jxQ+!PJH7uRvbYKU_I zC^r`7%!5rv(D?+>Y_h`Ws(h#+TsllqL8Riw>CE;H!nM0XGctPRN@v(}b?HJFgTAea zf@QOt6;1}IF+4uSo^j#fV?Tfde0yC~%XMhQqeB?ZrrN2UFb0tq{HkHl^|~LrUtujS zK?u9^8;$NdvsPO4D8elUj)(U8iFJXOYvtTk)eLbI<2hBjlDiqe+l2=k*RE%y*LwoI z+#a8xFxVrtGQ2v0V&utu4d$IZ&etCbb`I01HY&8cm$aeiha;`(-_pWoai@DD@*G_x zOdj=ykKt`sXq*(OjzO(a`Kqa6g*B22x%Ha(NJ`#4ndn2XT7_|UcmiVT zSEmVe94V9p+E)pz0PsCd9B2IkO>{|hX5BuJ2!=6d` zam1*{w~jpL+wo@aQ>MN1thm4s`0c1`Y*)Rhm`n`oz-L7KpIzd9l|1dD{r8-mDOg=a zj9<=$#VgpVE*{Q66kFR(C|<_d_SW{aA*^eo3!}PKEID;%@KzkvMwsA^l^YR0w9&Ot z7xr!*D&y^iKV8OjVghQF8WHPoOlJEIXg#|K9hh$Ri_lGZb{2Dgl4bO?4G9v&iw70# zr<7rD1{9cXDE7(v=9lV~#Tm+PO)Jy!UnQrn_K{DM8A{2oLa>5iLR_P6oyP3+91p=r zOtGV42|W^<46Pil*WA&yT9uaQx+OMDlJ;dlt~af_a}Zy{6Ka5XF~f}qdYC#Ctl>be zKbclf@FJf%a-?=Z?Wn;g2oa<9FMZc>$PMeBk8$rj<1k&LUw>ryIPVW$CQ+;#ir~MJ zUJ*z(6RpdQ>_1PnjTmkLc!Y6E)%E|O2Q0vwjYxmI*dpX8HTV(neSNZyO?bXiFwgkr z)1UE+%;xAQ3altLLv2lQG4On6g6UypNAydC==O*I>2)}57_Qr04lQVpsu z!wccD+$Q0Bo|qaJgqWR8QD&S4^RHHl5jRh21QQ109QrKmVnhc4Tw082nnP8?E~1V#ew znFFAiEt#5;^}=Rmwo9eP;v7>FaKh1Ps}BdzmJ}Y3sCUii{G!NZ8#H`idTzR!yrW8xdtnGh4I3b$Om5M?9HFDstLUolkpWTZ!{+BG#i3RIgNW-1IBiHRJ`GA?W%{ z;~jhk)FU841F>}U^xe&GK+Q(T2`{O!Pgt-hM5v=S630C6Pvu82!Nj;hYp`g;&Cf(S zEw8J4XFL4zw=SroScQvlqgRv}8cOHL|#RxLTnOCpRWt zndlmP+Vx15A-3R7mUjcPoBcs-LmhwI?~C1TKMv=QvRa!t@hdvhPf zz|m8YfZW8Qp@oSJh!fLh;v-C)W*Wa;ru%~>V$peNiHvJnHie^MFvpq3!C{tH{Gt)^ z>jA`YwGtKT`6FqpR#c=W3(WygLGFUZ(Myj5e)8I_7~a@S`5BDWj{yEl(rXJv<4mbw z(&lx`;(x;9Pw&MW%cisaRsDP#H*6wH1i!V2nMe@q;0Ih37Hu?A;_{_mUnVGifWyRadm4ExGim3=I1(c+>C%uV z&Raq5)e~`-%u&f!{U5kA2{5ky;gJ6<32SbOI4wxh9cN( zxpD+b5#%7C;9vPGvqVF=(I0bQbXMg^JH56L1vNeqw*GmeU#Agr)Su9D@&(%cpNTk~ zXZx-PCg!ZJvk0A0gg>&Xi`jTga`@Zzp`|lX^ylWQ?8Y`bY-J4dF%=qXBg;Sa6&Q^G z1hLhqJj}Nu%3YRfLp*v7Wd{h@cK%S-x*XPGuAEDqA-9h%P{;h^X_Y1H_>nyAHx+!n06qqC>YuCGo;7;+ zM#7m6`0^{dSz%m5ucgG%02K%3vJ<(Jh7ogSa*1UOx=PtlMBG3i^T8?$5vh{!&`stI&Z>mHri> zHq1xmRiPD-gY@(Xucpm%L4m{AhK*M2K$f>y$9-m~Nu+=40X0F~>;qUW{L(VkNR-bm zhZj9W^sfgmM3C5+EH;E((rIcrJZ&@W{zvtZ@D!gkLn)qWc`y=@uFIU3&K0gtVUe1j zP#C^^ooYBMZ`0Hfjl{Hx$UrcZLVd}V5P2kk6wdT)WhW-wK}USu)Y6^ zj$Q`hHM)%%8sn6UY6^(5-T9qgbhMH=Z~F=k}PxlMvUKm z29y#@qdF(E$y{vW$ 
z60yQ5Iv!P^b&4eh;|?n6y>GRRM{sN3x@oXF!M3;}Py4zK{2m7c6ADD1-7fr|v7Xb~ zOPe)Gj$BN5V{ca>+cPcnsmdIR`r#|}aJc0a%#H<9z?&s=5b`)FYGs~_v z_-~>McH550jycefu&lUn=p*FNb0z2hUL9rKeLjGzT7Lgz^z}l-ioT75ytyZmL~aRW zsFnq8OGKq}k8ko&1YZY!%T&#ucHwAcDV1^;tP%nMLUn1Fd4RJ3l{fh&2kc0ksHgII zxBdofO#P8_JyI5mbopsB{V6U?gT5w%Vk)uXre~RLn{)BA@asbLYj0Y}7u@>a zX?2aW=~hwwdrS4Y0jH`Gs2MSMEoIUBOiX)YEl4%QjRvSPCXu49fa1&_JUXqx5SOg> zU|jrcd01ap_h41sDN{?2<^h)aiY%wBW@-I4k8fJKdyF~Qa7h%16tohaJnq{Bx)sga zuR!&noq1_WJTy5aUsG{C#e#P-KthmE7^XfgE;KBxOQ%1BGikDvRIf3bt@n9@36uYv z`(TIbKcEtrSdr{*Gwq3E- zH?a`Iu0XO}ZJTbFJ}ui&?!PGi(llwb2xHZrM0lApdQ~EY1l6C-V)=cpJ61RHX!&eI zjp_LSeZ%!&<%S(t4_as)Iu%$dwBukW?%6 zrz%a2f&!5ofbx_f=@ckdC&LbOUMWA=(pS4|g4tuBbHRQBEA~q})G6CL4i!doZ z6=K#8K$nEpee9ARTp&^p^AlWLC(4rIX!U8=@z!ikxy+2R9tWL4x64JZ#cYo$L#IDG z#abrexx?cu-+wy-d%bk65jk|CxYDrni%yKE)7WQnK$C?w4pt%S2kl5WE42x^MK*6` zYMUp(4`p)fUzLniLT}Vgu1Hf5&^$n#z4pu0Ag`^+VFOkv&{* zpcP2JTs6Dm`PWtvV}Z1JC1X}foG<&D99gUfqp_YJaonlENHR)v+{&$Fc-UHdFR4uV z?nHv(s8rgCC&K|D_uc`8WejIB+Y#z$3L%Bg3~&lHgPPDw zW^a-pvs3TE)bh|mqhW49y~6laiULTe_gJfQ#aTCM?pbsVr+CtUp{ zW1cwWrDeMxAYpMl4X-GXES@TL7|~NT-0MS}yE*^#H-%!Jf4%-dE>&u2-}Q{kLK!EU zn5)(r8EaSwJGY<75&Ug=t}jk~Nacv(&!93}c}9=r=YF=PMkF_4(VxxFPr0c3#Eyb_nXxZ|hz8^IwxlF{r zJq_x#UH(NUX2q*TC9@NKfenzE{D2Z?KmkKaw zeWkPczWstOI@6cv_ISL4_G(+`z1Tvqh)C)EtEcOpb(R!XKN%8qseA*m14kbQYt>U3 zQU%!!MB_#+JVS-RV17+eMoW`@hjEr+nM#-sT(DqF<8sz!Sd!AnYv@}<^t^l8 zwZ17|K=@ZN=`ykZ+fNQ(L~$+jarrv_Bc1nf=&6&TdVpAgm^^5F_*O)tTlqVJE#E-H zqf3e-K}MS>>2ez6T?ivDxc) zv<~m!>7bF|&?~BP#b)tEBs5TI@te{?_0R0;`-@h>h7yN_D0R{CM*0b6wSPBHm@6340K#5y(WGiwn6$H1I_jO|9U_vva;!rDodbN<}*SA(y#TzTA?Ev&et*o?rBQi=-1~*nB z{g!D{8~(l8wSS(11I_VJMMF;Iq=F|3CXLpI>w`5kZhc+!M;a$pEC#Ckoamv7h|4du zNRB3pgXEUq5Rz<)=)6qBS4Aw%^6Ug}9VCZU7rWi$b8cPF+ zkaqPvXnz359TeSy{7h?!m_-EYao)Y(4CGDLKSew;Z1Hx&dXeGglP*maeL;5$BAfV{ zm>qj21mn2fFWD9GD&05~ZqX+3_WoZ2?aof<>B?l5zCa>Rv45Le+GKsIuq4dqOIU1~ zd6vV4H*|o1lqj21#w-pt zX@>PJni=LmW9Qi~kUJs)(^j4A(jX0UHrg@JaV#vr&i%*}9obpw%MMNCwJnm{~Mz~kcc z>55w6pFKgBQ$F~rl^ZfxV-EOGRU?{vz!cMd!$kO%&>>~?XP*v}*m*V5v2>-mdWb@f zN)kWSrjJYClN)sEic9PxSCgppwNf;pSyvts8ABW5tqiKrrVBr@jCFy~X?X2PeGwXrH`LW)E@EI_tw*8sXR;JMI$XfbId|%y$FBJ$5&*NKFk6~NbD=@<6zv2b%E9vr}9FWHZRR$`T8ZL`OLP&&vYKn9WKZ|{$hZ_-e( z01=I}!lWC5gnk>H%RgROi86hGx{vF6J>SpoHPS#lNp$)I$kfuxsp$@Qz(KaU$HJ@( z$i1e2mTSrm8R-m1hEoC3*zQ#unEp0zP_@PbJuiCK%9X0D^*G+hw-j5)F>u!dnf;pY z={Ww_Eexb}?R#EE?|ZLlt$uW9EIWyem&sf&)Zy}-6{12|Hj7_FLZwHb^Jb3c$>HFn zg51-b?gV$mVOuTaJR)EcccvGI;|8RAY6SjRcR#Q3-&n5U1<`ydX3{Cgaz%4w-NoAX zO8hYJ$4E?-PHlPP*WZ-7zVXF^ zlJy){|6=p?(K}X)T)IH6qaycoj~f&c8(P#r8sp`h-iC=W{<~{Phc*c0+O4-QXCP(v*`cY+JXtx8>X~mLFP-p@7P?;6CH;?pc@n?$!Ci z{%>im8N^I_d@(WjSis)FE;}sGM8>Q&3U6$a54>-E`R|33V#g8i{C5v?Mi=F=8=A~& zc8Sv$Yn`S%*FONAebhXa4WZ_+S{QWcG5?4d(Mwy)(yzU{G4pQRyN!fcIc5_h#$TYi zs}ddYEfw-9<(7!$wEt)`)QK7+3>7|~?E2RNAZL5w?mmis z0Uh-S9ZG9we!jclSf`4kW8dH`;^s1NGml_+~5p19rjie%Dfe6=yHheF#QY1dv)c9GUuh8#OkJ;GD_404{EIePI5SEb%hiiwv_#QlQrB_d61>*kv;*wi22wnP* zN)U(=*R6H#@L%cw+GXe8FXMce!l+7?0S}bX5rptlDg14oY)l+^|M*W&SfV0l-UZMJ z^=`82?~n1L!ttlzWk`3VG&@i<75&Szc^Llot^^i%>-_uR6_pmEKNtnnsQZQ0TU*+P z(b%KDAod0YpbU{?)?6T^3K9A3tuXbos!4nJD!;kmqUyXIQXyZFw%nDxl~OyJXJ@VY zI?L<;5wwk zNH~Mn3xgUyFN&z5#51XvN0Jeu4&Jv@Gi2s`b5l(7_5|me)bE~*`g5C`mqdk~RnZEYqY?0NvMvHm2`+{SYCz8aB29O5Fk(?I@!ZD?aqTQ< z^=Dj^51Yg`6NF#bT-gn0>|c0rL#v;y1bqK~uvF_3VSxCCA*cWLpp702w&WSO@aXQ` z{}i|#X{#+~psQ+CDPl1ir9p(9(nrlUC{L!jt!N&f-mF}g>sS&bU-ikwXB2B}Wbkaw zU{LVhB=xtJ4yFIw{$vj$+u-bP=aoi9)6oMvHqT+cmizVRv5$Jj_*Hrfidj5NoQ z_kP8j>`iSp7K<%}YuQq6#I#8Z3Mm9%b}kNn$$p7f$l;;w)_TR?%5J66MTmf%oB7myW~PX7^sGsl3cGb 
zd=oPx$Tp8Y&)$`tKuQ>vmcGxd-yDt$@lTC(rIS^2G4d~0GSM^g&fO0uLBFjQM1P0C%Brx~yccZ?!P#ajPTHHBnS{LHQ{Nn@K=~^AaidkIDtZNC ze#S5RzM5F4fA4wPyieS#=;s;rsG~}lQYOFUnN9y`4wR}Q?nw>H)EY}(f2Z{S&fRdh zDp_6BYQ|J}AmMK-@^1Erdo0)lv`$QJTJf{PSAb|kDrNOK8yLyU$F2!ELIUF4r`?|d zS_ z%nbdN%oka6nZiG{^(K`*j6}V?72A2V*|rVVvRwLf9Qf~EQI@Ue1^I;3x?+PC_Sx`#!nCX7_-qSS~@#4wg7Jv+}*M8K0;*5RvN1<ot$TrPfE$ zB;WBv#na9Y3T2`$I>adu!FatWyTZrImmi^DcY@#e3#=jp0 zrMjdLs<(=trSU(NbZD=!HF~mild$I~4m7!M9Flx*l02?VSd$!jxx5|~2dV_eB$m5Y zWF%=-$^FqkRq2ZQS*5IQ#a)wKnWQVoD$soX3zz)ysHh?>0Uv#tqm+tUWvPl9`1UjG z5$S@I1tmvTYn|n#d7ZYT%hbCvR&d7kl!=cS&RuopZx?xvgwChO{J}EUR6r`&L$I;2Ao75?n$k z4a4G>u}t3nPpre^!~bdTEu-RkmcG$IkUttB2^w624h-&0AQ0RF1Q`fnaF@X)cnI!H zummSqaEBzo%)m@=3j|1Tch|c)_dNG~o_F2%!~J;II((S5o4vbt^{=a|s;g^PR~U2E z$4a?o(LI70l@!z%Qf~N3GNIcn2kkdA(0oUevOiIQrP{F*%$g2+tu&(Dn_Z7hy&*=T zHml77rhZrBG9cXK;?1kSB_2U1&|mSDOU|QT*sksyb&Jy%Z@BOEXl8w#>lFY)Ge5z% z#bU%|++i@)1|WX~#y-|eFsSQ1gFtUb$X7`vg`0_NS}3Nma&X1bs6lzQ^d^o|Vmja# z?B})c5PSD1VV#p_J=FxswYy9v#n$sQ`*#1m5h6&DQ=~-Pj-=`HNTmkYi1MVV^ZB`> z1$4lR#w&TnjFX*gT2#&Ls`MBYB*RB$X7fy4LL{;Ayn}5*m+kZ8EXv6^BCs~4iCla^ z+IL6|E;I>Fhe@SiSI{`m4`#bw9{4Pv@RKjw@(wO~n9VGvwUFVO&me$`ufBEO|9*6T zV4%wpO|$0R`jXEa8iSOd{>s&Ulk*`UBaz9J1f7TTfYdD7A8l*{ZBxw4$#ZSVR$`M#im~i|jA{evt7D-yn-2NO9Jzg#w}_LE&~Cw^uCT^(E?$ z6&||~=ay^Zeg>6cdF#FaHtQsI99$x&Offvz5#zm> zS-}d=y2nd6*xBEWFU*JxpE94UJO_obUPJ8%;pEUxU(pTTT<3EWf_^F+4I23H-RWrE z5H)E0)7))-C<^{hid$8BGADNtl|{R^*Xkdpk!&HD-g~Lt!Mn;L`brZrI~&d&A{TAZ zD+0$0b^5fAO3lM;H|0yNNkgxIBr?ul()Id#p z){Du{$>F5lW%&g?7aV13n&nR)!q(?T`nM#1YSA0;K@-!Fc zAKf_%DU7-h4zpnqxRK^%YuUT1byek93nwS!~fgDIHPN zNRA5N_$ZKwX*c5QM+{LMsPr!YQy9K9#AqoJ@rL*tK7(}U_^-;jp20^M%+sU2(fSoB zgO9mk3o%;C(WnKJw|I!hX8zhD5&ba%iFiBpk1bd`KkyAsUD)=n4(YV2j1}N}xmL>2 zdFA;l;UQY=Cbz|RK#6xq%kq%U7NkSkOYa^q7ehxIVBNdcytG-ZLqFB9L`W zut-#}kdn`E(unfj3LYYZ?K)mi=Rs}v&TPO-dOx0Y|2lZXV($T!*vQKjzVGaH8IK+o z#6D{?VrIT9(BIOi@jZI7sXJof_8CESQvXk1-9*pt2M~7huh(*oZjRZtv=ttQ-H!H? 
zJiVEcjf|El35d^be^3mTI-QKl6+xS(JP%G2*8hB2|H8|UR6b)@v)Zfiw5}U__#RS3 znVYr(f&&7GBR%NqEWNH`RG)lPg7Kcz<~~qH=?ye6L&UX8lFMpC}60_EbIiiC~XNrM|S~bF~>yfGJ$0qPLgqE^%6D zWZqY=*QM(186$O9`V*@(;PvsX<;&mbAP6^Dg;9x$n68 zky=sb)nZ)eZr8;jhQNdx9NwP^3_Upk`XYaRD0ya0&nSfED6aCEmdoL+4YjCWUXYBo zn{7^Uf-KZbp&kyiL(+ad^zl@v9zU&<6#K_-_yRp_r(Pk=Bb>P91OIr>3P9Lng1-4g z&}<|yy!hL!9Xs?^FE;ZNTUpTcrPYr*)~#fqlj!vqb;!}Z4_yz!&SfKs)awhaRd`9k z0en9hM^*~9ae;Nh`cxD9l~0*G*=UeKfl4X?IPeFvNmvt~e5A(@@6Zu2ubXZIYD)Cgy{wK?stRoZ zzIbvY(^7~fk)e=w63b9-K`0TC&t#BcU`i6XJV9&BUVgmX6K;LDvrg)!L%i`!4k91Z1ts{OmYfx&$-g zgW{ZjUnrS;r~HMNTpe4_tFEEGMFsmW%-#+$xN%k{*On%C-~MBIAHM4 zB%%9|S?c8+6^wn3kEqH6n6hPMydV@_Y~os1QKs^}sY550cEXc1ilFQSoGpS3Wk!|G z52pXPj#lqER^|-aUP%^Uw0?bvrrYWFGf-)J^`p*rXkvAtg>{K_sEv{6MvM(TsyCjd zX%WH_3m-8)vEu~-hZ2*mHC1jruL_;YT6BtZ5DKyR+Xt2T+i`2f7hO?T9DSqTbz?{E zGfD#{%Rg7=q~js+a@R8$btm@qNj{C+@RrV1((@Yk@z>pv@LR8|+hE|qDG}vP=S}wx zpo2%dJ})-7n57mlNsEZYNO=?o;?FQ%nZR3QDx-huBPM0(&aeGAUllhXN)1r3*-oc` z;RY5r-u4^Hg^>O+ef{s>sXUpVw{&@n8Qr<6K;9ixskc%uHR^ai&(WrR;xh#mC}KE{ z44utnz00mCvhlkPI8si~;k3f2>*D8oJ>1y=z=EwbP`o_3ZaXP7lUrQW<85tB`Sbwd zw*H%QZLXnn)IndD>s3|48?hqz&YoGE>KMnD`KujxvO0JG*^tR7&JuS{0@I{$45()Db$!((Jh zeYhe}wD-(KgJ_11D_-W_ch` zf7EVUC%jqCB){YuL;JKWPTl6IR=h(V5&aumRt z(LZY`0h?!ZchActz-FQ$iB6_X$!*D3&1&F`szOk}$|BvUNZ6{wUvnW(Ve<`mIgQU) zL>>qj1igsvVBVTK=@RfH7-iX>uuW)ouq`2_8XRw{lM8;k+IKy6;}2*lnV>hg$s#tq z-rgtXKD$GNmi-*=YSF#@!jhOjh|s0dOHH=GJ2-uh1r5+&n#&5PkTtfEh+08dST%j> zZzGT54k?Fbyhd`76Di$AYQy87HLa{@HICeX2I%_e7V%E)s;Q~6)c{Q}6Pah7O zs6K1dd4mT_An7;8#i`fy^-#WXPtA_5uyT-MO?)*Xzjl9JXScK%cH#yk6T1@(DROPN ziU*8eX~fK454oT~cW&`bN(kst%GX`~Gp7|VaOn-}NDtpdz%S`aQQ?Bo8;=@1N9sun zk9(Z23KG`nON)$vTp6g`JSd0zQom*FFJ(xPz-)v3Z0QN0%ixN{Y0sVs0w+`kJKsZx zeIi1Np{~Rq3XRhfl&w^zS(;H zHs3!%w1k($-+L;eQNQorL#Y6sgw9`_+kedsq6=f#_hb7S%YUq(XGOR@#AvlckF&pj?tzO|sHiWlUpLFH zToH{wHbUi`UBw7-kEsPppO@8b4n16m^Nbfn|GwBVC3?p2CIris@r(z?MDyg&tr#G_ z)#2oId~-$`46?ifj{+2b$dD%I#$AR9ecZ zM3h>70#o0u{ZN**I9r^1!KQZkaFpw{NPDL?y&R^Ew`uSin|vl!b&_ZH& zVEpNOl4jrZW~fk^VnoMndHzL>nrC8M>>PUrvlF-1~Qng#pWhIXo=a2WPzsE@t1w)N@S^F zietOIjsyr9^O)BAfW;+ NoA2A?`2WM#X{Hz!?Ht1ZMpxdAO2>=8W5b%iYiDV>*Y zk1QOYpt@yNuuYEzpNpzSJ+#b&(@w1X%oWvEjCoJj-6R*h@&LAdntKm?glBM%Il_~H z%N_Mon8SP_EiM_@6h@;M{CSvEpJOM@U4wL)cgC`hK(d$2WFJHmqJzw0gvfK5Kq>MQ zm(OrM$~b>7zz3Hf@E`oJ{=kI3Tz3LiG|zJ~b=npEel_ipgr%K)n)Z73ymxGW(_QoB zhCc5bH%=TY*|R;A*^&=cuC(7QmGhJklq+RIKTBI)MHcu-@N?VI_k;79kw>^dW@%#BHn`P;ay&}E?14}qT|R@A=-&l$@z#~yuqOBsswHoP&WcTq_wh3~8T$4ic{ zv*uP_DS%=9jI|B@NwGW5KfH8UZD3kSMyobg9dBoe4Z#6Xq&dMPx~Co27RZ&@OIY1l zUrjScE#pkFQM5e31YlVxGTYuOF@7jpK)#6dVS})mj7f)~)BGZ%w}(pzvIQgwWD*wQ znbo>JzXItRs2K?=s~GS;b<8a`|2~XREey)MMjq|q{sZo8RSi+|Hxy>10DV-BOWK6pLPI4VuCl1IN?=cHHgD-j~ zis{=@Oe*@4udQjvKMM%ZdBTB@;9KCMidg6>{EHNOghJ&;208U}5~x$9l(=FYC%`OD z3NIrk=8^_>4VOSkb9HEJn(*`tEiA7?ENCw;rF-t;dv0J-ThpKXnUuQ%#y+~F@;X;z zABenekQ-HbrnaHu@l#)FJ;RBMg5}EpP{8*lBs&v{o4(~yrumwi%>1SCp0AfN`>GK~ z46c@;d|+yy{iVxrsXtX!WMMvuMiioYO!HV^$X_f@%?Cn+-JJ|C1` zAUmH@xzw^OL3J!T-I2%#Z$amFUPW#5Eb-@H&4#~Nb93Gpohf*o`|5ZdGX9uFbfopR6tpxi$mJEwIZf7ySu0=z!2aiv}w0G57@= z<0aa9VVfKeKE)>h8FT*ZHMM+bZ5Yrbe|ir2kZ$u%nVc_E&O)&Ab$XY#eG|`$@58`z zhA>}96hs|CQKZTyD=K{8&Rww$pj7fY*O%G^1xz{~oz z>e0A>r3v)@!jx}T`BNm-a~fCDg!#uH#Pd1c)JykR7^606oM#Msxc6wYajX9%F zSb;;Lt<<&gjVl0iAkBHe*TuVQV+H9%!rY?e%*8?CY{5*yI$vKx;#-ln;`UF<6XPcW zc(MuM$O51E%(M*Oq!pvOF_n07>B7Y_(91%j)bcUZzvk}7tQkmJAhBEB1g5i~TM~bl zgFxkTo=x>fxqxz}Zv(QIJh2}{liaNbsQXGciu<@z&7ziz2S})?fM`Vxfq~MA$_{Ej z!F7C@iIqRmxptcwfC`6{#Mi@B7ILJ2vO17yG2FQA7pQH{D1A7);4ooO!v7v#(qlh? 
zrU`@JmGn&F$ngi)u(8JpJAn+rs!DYsoY3-&(K3@B(}2waV0A_>f-x_zV1wti5~Olt zK$9!#a%E*JS0o$`fTGsyytX^`vF4vG;+uM0V}z&~y4r1HN}fx8S6rifgl9CHuuIRV~#fz4s@Qhpqh?I8EAILg3-55c$5xB>EXq z<8?iM_p8!TAD2ub+a}x0YpK_j@9J(51G!D}hSnb$_{?5==l~lldB{xtaMcrn8JCRX z9mj>)1>Z~oWr{EH3pIl+yJ6(quj<<`zwVSg0vVh95c%`_ZmdP#@uTRC`i>UO0UtWx zoG(#QRF!;@M&g^xZ2RbS#S&1UJU8YAixQ%mF8}EU5O@Qy2Z8rVR3;#p!%ia76 zD`v1`KnZH7&>@y3KHyz`2-X-WU#y<`DdE2VT?HEb3cDCeKRddyrx%&Ylh*oMyG5JQ zZVINHMz4NY4pomh(H1$XhjQV41MyuHzL9a0AQ4x|!``E&SkRB{4n??t3z) zB|OF*v@>>s%?1*|#s&iX{xcHc_peWtG4Hl@DvWeVi_@=`q2r> zKdc;6L-3=O?CX6l@MW6*7BG%Dkb~>S^^7rbyppWyQ$k4xiG-GCIhI9bSsa;V3-;r@ zCI(K5zf<1PB6ST(9!9%FBi;C%i94d>~d_;-T-F z5`vy?W(1SkSY?=JkxC$5fe z47ds2q#c5O!eosbu}Mj*$KDfoh*M*kP-S-Fi85}EL9mjwenlAKv4QzFN7<(r+=)vJ zq&d*0KxucziiKF=Dx8I9@NngXJ^Iw?hQwXRX)H`ObEt2%hxtW*WOh`WW)eLz_>ZWc zr>bkrUMc1u4}rCC3QxX{$1VV?92FA4jEZy4k5WpLRZ~FbR_^WF>8B^JEf2eWZoDom z2QxzpjFRp4)&5%ZlVa*^Y$VJF$4Cq;6+}IZKT4Y9aGz(TCR+vQ7ULuA{I7Qokf*G; zYBM{JK?WvPm6@D5R2wlXWFLeRFgD_2HO8u?Lv0Z-Y`J`Z0-sSw1pby?U9#pxfF1it zX0>g^BM|iWm+Jfwwb9g-!FsKY(Ok_e{v%gvyjWVRh$yqsXnDT6rog=GnV{WwbxPc7 zWNSG}C$<&E`Fs-W`!DL;){eJZ!S1_nI6vd$5FQ1Lf~7jP>KG}wD=W%Bd^<%gpkk|Y zHl2twmlF5jChxf8qP>FhDW8jO9vHpJO=okrI#I5nXJC0<9+g8rH4M0lX^S*c-Ka8! zh6cMVMyWJCUIW_1YwZVHsXBwrvYKK|Y6WIEb_41{o52J!JJM}Wh zK|^NC4mk_euCLgRWX`X&Z4H{UvrgB-QYuXqt*Q`glzV0>ym6%9y^~hjnGQ3b{BR)2 zL$A=OvAsc6P8ccV9!vIjscL*werMi_4n8S$UMyaRQQd3ew>p_Nv3R@mEY@nGf(`~e zhC1UKIn1_uGWt(OHRDLY)mMJ}h#79)kC{(OZNCQDkt`_7NBUaPJtm%&2-aHibX+1F z-O*D?O@h0x5Pu9%3`o101{`2c*nc~~p1|zNus}jPi;&znE4JwdvTn5jc`0)0`uZiB zn0RroelenL*%u!uQBim_)y;AFLl#7W`Jf*ep*SVcRi7VE^Ap*Ob(vHQ^J9_ob&R@R z=Ojfu3E!Q`L^7v=lu2t6b!8}~{OzxB_TPpUR?536Z#cEv!o&^Te|k8zi1L}56C7Nh zOmzRC(bT0>PS1W zzbm3|u_qdS1jEsAqW!yw&8IM?b6I`;hULD72E^cy(&c`{^}03}NlZ|!TVb6XWh-}+ z9Jt9sX)V@}9*)alZ=ggC`#3OkEDOlCpAGYxo(dBWY8YX`XL_N?kLvbFyB>=4=y8-R z{iQapF+tz8^KmAL(iBv>k_{KG6G!NdZW~zc4ZSCABMfu@QNAQVHoGOEv#gnAy-hD- zoiS>skFi+TUHS`#qhGB_mR7YL4yz6w`_Qmf88#!a$TE&qAb>5QAz*6$tpknyg%qkD z{Yo#(rbBukir@zgdp`+5K7p_MnlLaxDAA(lN5X4x zp~;;^G$RG7X$54G@FX|@Cd{V>yBY|Bjde#^4GU>ne~*)sri=KSI+dS8RHCO^L`rSf zpRS&r$yae1cBlDai0IbacBj5qoH5aw^iP9YW!efZJ>DkxU`}H@f@_G||GBg3YWWWM8LLB}wvRq{Lsh_t-HL z$l$@~t~O4sdVe4O<$E~|Zex*P^B+<^;@$ z$()2|h=L(AB%@$UWZ@jpce`ix8yM(vw|*!Re|lHCL$u7AiK?UfD0(7YzLgx|%P#=F zNxxib7v3?yb5=gS4l4$@k^)-n+<878qnyrSO^)bv?0{i%ehON&nWFlqYdoW|w4(Pe zTBP<>3e(vJ1bwjKu~jnXD$hfnEBJBOkvs6JONt0n{Q32JMI4M)*gZL_pb#LVY5bW9 z68G@0^2Y78nbR9Zfj-s$nXs9e?|rdEN&Gmeo++7`1754gAt`|FmI$czilPb#EFvrjN4mQtrt~XOY2sq-%ijiOGL z+CaU?F9qLsXJ`Bh;o;+M+g0xSZ+j>$uaRe@;Kzi}PD*p^G&@z`^P6!4bze|TPXPzK zrh^`%K1wY(XBM(!_nw-&*6`=YB*E$S8jCqEFfOzgj@(ubxKODMStYaG$V;mH=lDUK zePZbb{z6ZN*sIujy>%LcvQbtldjSdU_ZnFR$;($!e`WjfaWIi2lwh=w*v-rhKf3Pp zOq>@zsy^0vRc+z>*8f}WaM_7>ZIodzp(>mM9g5TDZI| zu5c2m`ppAa|I2#}aZ#S%F=2aUKp39(ZM5Q}XGV;}lWyOZv5bPk>>FpQG!~n!No*v_ zE?R_Fo*aM6ip-00H$(S`scw#^snu{jaQ_~3o@Sz-Y5r(35B;K3K3R*L^n72+z1_q} z!2z)zYHzcx%GsAf_R#G>Qr`hx=x&zankC0+Q*k5p@#vedL@#D-8~8G;@msWvL8 zD#7+mwnt$a!PKxXAxF0}tzlU#ayzF>GCsj1Ui#Tn!WYLccOSi7v6kGcKPh+W{m4?b z7{FC!L*io`O%bY=zW1Jwg+EJ0V@;?@H%Vl!eqdz#ccy_unyf}{;=-!0?^doodxH1r zo>BqVvCn|a4NuE%q4MyxkTSR1MlWEx)WeBe(!GGK5;t>#EvD?&RZGqI^Z~Ceg~uC) zHfhrK`f3WvqBY@dsGWpv4haGDtA%LF0?h8;Bta!|wUU1 zK3m8VR4GEx5^y*$zGj#?SCulj56vE#oLdp+2Ye*+^Wo9pa*Rvw^|$t6`0I%R>bY}% zwd`Pica*gxOq%^#JYX}$q}@YrJLoj9_ryV_^aG-gA@I<=Ir#pFzB3#2F`!HA{E47e zV`9?*hD%%K&Z=BOBFg=FmbTx73l_9~9j$5DQjTh@HY>8Q8Z8!7XAhf9z9^axG3lK8GU|)!qj6RIp>+MU66&H`&_{Di~u)h@aRU`1U zzfgUgR_U78n{l`L9mh^76ist2JP&|C;DJ5+7A@Hp_e6!9&?9=y(*Jl@m!uxrf9JyjfkotOa8dv=&~O#>`CDtQOB)a0Y;l*_q? 
z6}_+9C-g&n=N;T#-zKb0%`VcknpR!3Z9JH8-Z?P27)+VH6~b-d1q(*&Ct{^Xu3-e$ z-q}hdxSC}ZK^|;bRIu3}%9Yia@y!DlSF;nZmbsCgYK`&F3kt=nG6TS71?J@jvHE~G@X&NOLkO4gP_dpYG*ReqN0=?%fUnY7IKPIWH5*ECJB*m}#` zr*zb?StAr?md0LuQZd=^kG}$l=gg-?HfDWs3s@-65+i-3L$R+}J6gL+Aw@e4{57l- z8tV7Pb$ngz64vYqr4t5wmQ$Bb$A9%gto%2!+Ag3RKAbnw<41721cVgE+jL37rRgpl zAMnGRCHRrAb((xVX%QQauD zwB2kRf=!`_z0&Z~OrZ9g*4@tmT$*aigBd&F+|yb0QtTggf~*tP{6-5(?h%IfeHa10vV7dD7q{fUaZP;7f-TP5)TtB|wP4 zwM9bJFy9h;1krUEVAWJvmDbNV(ovm^LaPN=Xd)WlB-qX%AVu)EYNBDi(OF`@;;+;- zo4wv7Ai(25``iCsX;H&kcpYZmCfKTuL-|708pDC}?ZA%fc@6}Kv|yHK>jc|$pcSYY zZ#JYTz=8n5`39&c0S;B$=q3ekQDjEBY=0C0>;4ezIC+ymovT)}14R8hv~-)H83kQg zoO8c3ju?OMHXFDtnMna5gP=zg&q5(qxMCfCK#4S1dL%t zVbvXkK#QukZr=aTEkdk_semzvyM+ZfS>ghKMfb<+IiRNlh5xtB&3FrI#VxG8Ef~NH z)$r9P8d9VVO#z}kz%H;o07o#;!bVvk@aqidar_u+n2{1dbv8h101&FhXFZtg#>onOE}O=UPG_uTLuE z(*yARSBMSJfn5LdOjvJ{Av5#ufxo;vc-LnuABCLtRgQOV2v9w?Fl3})0c{XL1N4xjQ00CogVHm@N15{X*yMulWzzmBv;9P z4p)d5Fb&EtiXo=uyqTBf>v*8G&=pCpS`kveh&-YlzO0{s9iz3&;{&>A4T<9>;^@S0 zlMS%lyp>ItbvFIgX`YYnyrjQ~p@uB)S)dADMKAAflUAg$e#rdA9(jd2rU&j z-*mfMt8wL~;B%&m(j_~H{gu!;Ap$0fr4ugkavVUGMsoLXx zCs7Cpm#KK{_)_aV52>L|2v)d^L%CFEU_n^w-OKxYQTGV&Bhg_*2S3XqJl0vWKQ{h} za(_rh;V%(_h0blI*6tF$r$`2tBwu~z+;&9}iiO!D13#D5f65Wt5KtUL z>RIlld?)32tIGzYLXz=aEfMl5jUocyZeolOkXZ#}ISh%JmOz&8DJ}IBsjDZPz1sf0 zKUM?C_k~X?$IySk@l3F?{pC!z%|lzC!(JuJB~fle16Eq0V-jJs6ns9VuHv`8#*MF0 zjcfvxy(*K(Cl`x3NSHL~ZBYuvV$wmF1i+niu>aHTrv0R^r8k+)Y!*cgV^^WwaGaBlo1 zt1QE{p^mw=LCfL6e$Wjntw&~2_(j0Y)qvaTpyqPJjjm>IezV)(Ji<>Ww6t{$Hv2U!bCSb?3`90$^-840wJbxhWfREAa5^rA*PtLD#Clx~pQjLhAE&#st zOX7q20=)Xe0P}87-K6d!aPFjN%s%39tx#?eRsA=Y|_`v|ys?T1eV^S${cfIgh%pM6LLL#lQW-roPMjRT0iQ$QrS zT9)2E`R__Mw6{~_1~AQhozV~fTTB1*5(7NNd$-p6zuxmd?-FZ$^8=cnrLDpHKR!@% z=hkZA$@v$V|9f?S_+8U?kfM_ap5Xt}17CT*0a}Ycy7>R0{l9hN_OcB?Wv!h_^?!Bl ze|vz31Xw(iV&^&k7heC9keyrb{sY5*VE7k{{Rf8s!0>Oe3W$mSaKnGN;XmB)UuMI< gHS7OBCdKZ}XYM5j?ux~oJHVgv3k`+J=Wjy(FIjgO3jhEB literal 0 HcmV?d00001 diff --git a/media/images/cute/HMMA.8x8x4.quadpair.AB.png b/media/images/cute/HMMA.8x8x4.quadpair.AB.png new file mode 100644 index 0000000000000000000000000000000000000000..2b04c7328afc83d3fcf5ec9e1126303fb61246a5 GIT binary patch literal 609515 zcmeEucUV);)@}$$R}y+J0V&c16saLVfFLat3(^!pKm?@s-a@Z}A|(b8P(%b&nlxzw z(ggtpq!U1TZ+8Qyc+cCr8xqwk*FH|7$moV2+V6uy0Ay{*@IQ^AHSR|c1o$kg4(UcJ8yF6``m9$WPdV2Ud zmP=z{L>tv_4{CmWTt1YLm6erPtl6dqWn8GyObN$><3)v&Thu~|oAX}2LuG&h{VBf- z?Sez2X&4yJV<|_EMI4HhW%+Lz8`K;P9h<3`@#~R-5WQSh>B6A#(anLk~?~PH0|rU@2CQo>j6Z3ecQfpB@5Pe^ioxjfy{iXsr;m zc!}nbD>ds)wTNE-{daWA$(I)(QAs4NE zic)nV;@GByq*Jq*j9OBuI$oDJ+?e5YymI6asZy+#}a-~ zrQDA}k8u3gyQ7=b7#f-iw&K*<$tS75IZiN1ABZ{PJyu|l^W~hh#R1eGG%uf<0;wCh zD_=Xv{Y^&V!$~rPATqv^Ru2^*l-Kjn0yT~6pkP==W(M(YL9hz?*ar2wrZS)~a1<8^ ztr!=Z@`0RN8+VwltTe1AAp&sm*H{~6caF1%xB7N7j zxtwidJ~Vuspj&n$TEdZLJjlDjQ_2I9K1A`bBCY&JMd^`NrZSVy4d)oPrYAvqVY)#A z7sg`y-Fml}*Kn`q1~+wE(xeS;{6=?uTz9U0RSFr+u6f@`Z%$I&_uX+($c@n!B(%sl zfHA3y&G)cAFVB2iIk>cmsxgV;5LgX|nMB^JMDoEqgCc`*X>c@u+RgmoyFUa<=AG0( zl*cy(IuJmk_~@cp>z&+P-3bw!+PVraO~pt?=}6aqFqkMxfNCt!&kj{uJ`#_2-v==| z)XWu{H7pTiZK{coC9lEWU$|vLx?+PRbWaw$OfpbM@L=jfb)W(%@x8m0m~#=Fbech~ z^q)6{Nc8J9FH+d?T}miL?*arfJ8wgIB!zFWl?3R%44XdH<<8%)NT zV;hqm6PN~cH8V^utYmn|k4Bo+FHg$vSb;O{UVqHH1N8`54SCdHpd|1js5s+s{A23h zFSQ$3U)GAXX3tnro14>gN$ljp2=^)Dae%PQ2&h&DDMMxC)ZXQp%pY zM7uoeSNBpIfEmUA{PJt+N=dS@QW57ejN_siuL9S$u8Y72T7~OEDyV1LhS!HwZE(gp zE_!?UUIyL|L01#flrz-c|SiKE?rvkT52Ee za^;u#Aj6uWF4H7KCqv>Qf_GWab6Ik!sPy?cRC%K^+{IyT3oo|uW5axNZIgassaN*( zwn0{oLmtOV(S9>*?TzD*BW2>|;s=kJz|B6*lFjO>c?3OmqXAa&_tEynE!eNv7W7zm z-{qmlY4@J|f_}ch+gx^k?Wt!+ZpUHALI>67jxfith%jO1o7WcCia)A;jtPoEs#>ag 
z#}qO7UVgw7&NRy`dd>cFjcw3#sb|p=&mNZaOy#I=KMwwN^!eQ9_ONA+qvrw-@fOSu zGM`;u&!6ivm2fU4m3TIEr@qArwN*A%wlp_9DEzAV{Lb^S zM*<%n8NFL#9_jnwu(Y_Ow^;Mhw%GfjgGE=V=!duOYbO!zq)z;g^WDnvC>IOa85#S> z)l(rdp3)H+uhK3JR4C2avDLMZtr>*F({K4^sl}WzD!s2u5 zZ1yA7+?YFX{6>CPk!g`<(TK4_)qGWqqoAX$llT1ZDUru-ooexl+oao1Cwt~r8~TRy zrz(2ZUow2QEgal76*kSP_N|^CbsaTTn=K%}Tv?x(SGzJ-yt_k-Y+xOh3 z+gsY+Zy#$?81zp)?(5mgR>}StvyOHQzO-@3to^%Y(kWy z><1x}tzUjE&d5<7jgbhb!>EWoP5hkg#cqRze#;sOM3b9?Yo(ic&?MSzil zNSzXJ$xRFSuNPWPpZ&aef%+Dv>Y_0)c2s(6x|Xzq^L=Ocsjb=Esim)cvsP0aiFgg? z+&B6id796AMVwQfw8@Ykkl$q3Xw9A)p~INtaPPW7e${4Z8U9-RQBiWCSC_o5UG>BV z+IaQl^oeM?r`1^RB~7eMa9E8?5J}v5aHli<+WL`DXQ6Ou!qBr)qnX=XhL`i4?Hhk& ztmm&g%^u#FxKo0g+B`_!)uQO2?=V?eslqHj)OfEkoqMfj``E8f$>)W|y%AWPiY=g`-x@&oC)kpM}b6HZMsp_dFX6$7?v*e%XngrR4SgT6> z=xlEcy&8PY_ZDvS%kZ7edDj>xEj3FQ%gH;>-?tC=8S|OCzqyq2aM`PGGisprC+CmH zp7&RpE43axZ@cFEU1-S1S)(y!?_m`_@=fC#6(bG@jZwEfwXL|TBU&F)9gfSCE3@xg z9dr&bXQic?ysSzYz`bu9F8VoNCAviRv+wq--bYJOXNO&Qu};NuT$Z?BWsSl^|8|0{ zz{)H;e^dW>$Kpcvf-KXqK2G~NYfZaH3Rh;(8cF_r^=7I^C6mR|zg}^Nsim;zvYR|E zJHEa8ZuEhn^Y-fEB$W~G#~*O9qsDIoj&JPvvb8g%>HP4GHl6eLt-zhE@CEm+Zt1Ox zu=UZ|1yK(Zsh!t@Z%Ia~NARM1#*eC}ybB6^UG{t06;tEP$UZsYFD>JDZN&zM-}0H% zdU7ACwrKtOn)h+d{?pEzxsNp?&dZSB$v?k+GIH$NFx^~^tzBMP`7Pz7uyGVhLI&&u zt(?ymYIW}ZY*5Qa3!RQoX1|=PzgM|n^+GQP&C#3>ulK;9$zI-Q)WC~_Wt^<$(=ml*BF5YB z$^V>Fd(Yg#)kPi#I~nL7e`oWw^sxEoNcPTuTNbcD z(UWgP#Y8TP{^Q<2SGki{%gn6F%ADYrkL0j@ju7>bItIWypk^n3U|HZF{-91fVj;Q~?Oj6@S`nyOij9&OD<|Gr|Q%B=p;hvfGU?=Jc6WQXO_xZk5=yN7t@ zK3Z++_y8gvP69F%0{mY!FU2D=zy{}1(yE01`%WhVSb)jVH;Ml1aZcxiH7Dao+<4Se zG5KH3?VsxaC%gWa>-l@1{M*3%=!{%`m;aB%!Gi@^vi(m*m=XXsXrj-M75Se@kRM@4 z^7rZfcS8n7Uc`cKyi=Kt)%c%@gB(5ZKbsrsf4AyS)&AeDI#ZwjJym~N)&D(JXIALa z{~EwQoXr32tFRBrG_&CHGhd$%&g@)(?~F@LV7P}nmOG-YGCg_eMNNO0_Xy~}Y%F)2 zywkBB^T&PuZL@Ba3k!PhE>%VMPxBP$ix`hGJ1{?3v{JAwohGBz%T1HN5jd-!2-9{3krKB}H&9Uq1Hx z`9DOlC_h^L_V6t3B;%JXh*TTA*sIJrDe>9>}eCNWAk{ouH4>ekOjgIfiA> zJk#NCGk$U%LGVgXqx0~OrG6I(>?B3Ydv8S#n)!d~%aA8W%QpS~=MMf~3#n4XX2ak7 zp?fB#RlavjRA(4YN5D3a0?6-nU`5p*%gW&E*0s`IxO(t&?9jw#yK2l%zUt?;8~yvJ zre}X#_DAz4@~)ciook~W_l@0#EKNoe|ML*V*`2uF_mr$smq-2fml)wb9`lQCW+R6E zBJ@`OelJ{T0@*fg*!&Mx6=kZ5w6|>jp_zzWC_3O(8MU+jAEKAh!e)2EdPnU>t1Ftk ziYjK(%J=ZEjXG?XV=`2hbR9GQ?LkJkL+A2!Uf(|cv^~l)o8fM{`a!=77>YcJLj6Rr zh;uZ+P~#2GgMSR=riuO3iBk+FZX$uN1eVXge16Brv*W3lnD^G=+{j0VsNFYX-wpfI zFw1{*`a>rxlMvp%cXAhM$Ac&)bo}@9=L<7xKIquyVT}J~$pJ++2xY)liJqS0CyLm2 z@5E9>OqknKMdLOX2Gx|6m800+zJ49Gx3@RiWI%$@dN$Zk6!#>U?Uz`rXirXAe8wz2s{I#4%=`(@&mWwY`~w9Vj$R zBEcjOJdF&rAWld)ry9%3w7*>IPsst73p>y9@(OyiX{6en<(hr;d)vY0(k?iQgjMGw zkwLxr;K5b`LXry|mQ`bZ`_m;af4I|o=6fpF%v}lGl8K6Uy!qx1-E5|}J^hR8x7@zp z3!g3#?EKo>6*&mVBQ|++tzFdW9Ssw?ZfX40sI9{M*|Pg(R%N!zVU%BReC<+Ct`Rg! 
[GIT binary patch data (base85-encoded) omitted]
z2}r1Jj7K)?DBntHmaZMGxEyy_lT|wBF2qfklMWqh{v09};<@gT)bY{%=F0n*HdvsHFqC84i3&s0CM06VEZ^P=4aS%qrj=X2AyK z=6K44HZ}G~XZI2fLZyfH16_H}f_c&Rqu!sGL>Ez@0^GI$dRzB@L{)*gVu7=o5|MJ= z%uL>OJj~I-X#=x3I5>DeP009PkhPOE1UT4%LpWt(L3`i#96#I~yC9i8O30&?6I{BI z`PYrbfn1R8?gm&-R0}0|%|VTyb}U)xb$8AEdi&l@<9(^OTIzqyDC>Ia0^}?Sk9p#` z{ij?%0*Edv?zYBoAdG{;%=DUNjTFpO|IIVT#}0+4_}<@o{lri#SyY6hOQ( zKKqyGT)AfW=~jS13qsMc6s+8WJI@LKT4oJnz?vrIoA%CK%;EywDUii01w#ediamcQmQ&VE_#q-lN4mezv{Sqg9%j(p4H;!;_RyXJif0 z2OLTqLJm-?)60S6GT{iY0#@|ENB_DVCx{B_H8pFSl&jC|#jarmnVtVJA$`R`i0xEm zv6@B+;pS<(m~J!#9zK=qU#y;MK~&|otA0Lsa(VQ)YSK2g+g3L&nIcOwRK7h`OsYDu z%GZOWKuIYi;wQGom*gn!(>Am}51APf~#I`we8QI0dky zxWd_byRm{IvME^R3;{i_h4WIqKr%&D;p}T{6_mYqPc2w)vYNeppLtj>Hjma~+P$z> zDGBV!EeyL?dujV9@j)RvOwYx%w z3*?#CV>hfzUGVhX?nl8?!Ls;WlL36Y#ITN>x<5$@)NEwesjmL%bHi zUP|&uDy><)LHsavlasx_Pw#pr*}m6_M@(zuyKntFMjmB(IXlaUN7+ch)}Mam;QNL4 zDQ@#y$j$b5up?3noFWl41i=?ZFY>UB#%{LO)$Q^puE`-ieDMy94#{5&DHPX}ls)YqzTNDy7+?Ohh*0L?*GprzWN5f9ueXR{6>a1u z0Z8}u`0eL*9!S6QhX`bHRMIN<0?ntNEq20r?4`EAq=LTINV>TuIN}P-NpAyB?g0F~ zI;~C0|E?uW=pfFW@7lbHjw~=@Kj(VC&Eoe$z7xo0Bboj6rmEY=%o*loT@YAf{@we9 zXnN<|%?;CcA~1ja+-0Q$JNab3Y`u8TAAhBORam05d*<6SYjlZKoiFR=X;ejC!IBH7 z8^%O$pH#C~x>T(5R6hGSLVPQ`$G5SKFzz*Qvm^wmn*>Zip|Q?`@i~W}+1w}XavSR4 zrdZ6E_jNj6Q?=|03J;p>r%q!4=U}6Mzh_L;Bqde(Yy-CZ78~v_$CKKEa}CR}M{6I>_(QezwBJ#yNP(X1F%kYhzJo#yPe=0d@7JH_87O}@ui&DXaW=5AcA!5ndCeSq*wjo( zx=+N>Ov!KaBT}wQxW=<&6>A;+p&XTQ5n(_v*eYYLK`g)oZ1Sk4Z7~mN^Op?VF0YzN zg8uAj!q3cK8X+-Rf$V9!y56QLP#|eGte$X;8h<2|7MuxgVy!~Y@)(;3jW5>2*^!Ws z{bJVz!OfZTA6Z1!{S?^<8?&IA$YbN`6E012Q`8>a( zH9erQrD$yR-EUNM`FS7Z?ib}3ecP9|J#W$&asSxM7Z$aC6P`-qC9<9%Kp%y#)5@VD z0AaOJxvCB0Kn3{zpaFcXKrU#NvI5W0p)19ilnweM ztzKk*??lK$s3I3--JCnXXFoV1(g_qiy{`W=&A;S{2gKQ%_%6ZtLCP`)G!o`dgLVt% z&rwz=Yc^r?!oJrB%mT+{t#*qUqhmQ!<(BE|J^2++azcy~1o-!bPXq{|j(%28xD_<; zH#lQ{7m4P4op*!%n5wmno~|~^fhD8#fxi&0zGVMi=)lZ#Y}TO#IRAni!+-1&j1+o` zipbFZN&2?J^q4EFM&lfgiXs`Ez*<+7@0Y{nQS_QUW;;gGyf3A!Su$*G&CM~eXn&o~ z@5U{-zh(%|Ss2?hD~`1b`?%=CkMmmC>;;v6Uajl?(`dPw>xi^|@7b+xVa9`t6k&f% z)?nz<5o=H;Je^5<6Rnq6)}ZG+iU*yn_=EeykvX=Vqr*`5A$1;x(6uvp(e}!H-lrB z-1P~7W540i)OdR~&3VH}_w!Mt^95&1-~^Z)O#1o`_Z)P+>RsSVg?Qavu5$)GVW6dJ z%`;i9q4`6)i>egc8hIDo@P=ye zfOmz}XxEU8uu?qp>a%(PN~R?Os2u7Q{hJ5N(Fk0vjd(B?_#2f!2llGy489;%Ub~tsf>XbCu}CCR5h~>#XDkV{!Uc6+eI6B%j#1|8PQwZePVEl5Elrmf&!qto**pY^Wl?-x~RIXf4P_!TT+&J!-M<_)*G2L90-@@MhxX`c>5BNb`)ELyMA$Fysdh)@ojZgIL$ZDd1)BSFJT_Vl>)Bt36<^ybghX~YsO?^ z?SSRv>j9^HBB{(UbC1wl#>#tE`?Nf+o>D`M_pX9rzQ?NMouSg#v=xy=+8W(zux zWWnBakNAa-J0w>UCO^p(y0jqS>mF4-GgEjJlaO=$hL^HM|J1SUx^NHHM2C)|lf-9T zFi#oWgbT#|NAdN!5;>nMuStRnnxxOrJ=kq=iy-yKo*K2?2VY7(Ppu@RaA&uR>I+30 zOFP5Vvu5qD9w_{|~H?OiIx z+sV%QgB!G2jlyvr4Mn9M>3+^TZ$BqDetmJKgzleJ9*5Myg@gZMMff>&YUgOlngUqT zhI1M#R1aABjQ@@9{bt#B7$+bFU6uK%G$b&1xYNNwNW7lwLgPf~wbO4Eg2_8xEpxXY*)%^bG|pb2Z~D=qUpg8 z7Ezayt;co8JtU%1GM-`m-Hod5duexq_P7n_Qvvhq^FE$;4axGy2QikVyz<@ao5$y{ z>{@|q=bJk5kaOu|@hFiAL9Vm-jh|gRE#zSUt4KO%5MH|>e};5h$Vgg+~6t^I4LTM8B z0G@~V*=j<(ECGbV)W3T&fYm;1X#XS`fW}uUzCM3A5Gpdw|NXxX1k;t4CX#eair(QI zWp@Lg)A;SlvYcR1rUFRdM8EjGE+f=zNWjOu3#qgv!J(X=al#GuF>*Gy zLTM*LbK{&1HjDdRT7VamyD5F?|6~CSaB3sv8Flq5u`5LD&Z^yZb+CwbvWU2tW`D`8 z(*B0=SO;1h36K-y>E3OMCS%qsps3_|GJ%G!aH}U+>{~6wp8(+WYMZ|1eEwdN5v6kL zx|RSWeX>-}TSV=OL?!_V(2%AKOR34gLgxOt{dH9T&5OzqboVrfnDD7;zA~iQ>;KIR zWlwWfQu4Rv2E__{+Q87AG~*l@nXh0USb=fOP;dRXJK# zhN#F$jXAohJ78J`udXe{wh83?aoq2i+JLKT?DoLK-ed+I4<=#&1S1VQ zwfIq@xpg^8(HoE1=Iv)UyL=>mx=5M&wt&6>H-G)+%by`@i}fj)N{I9rjT>1pq3nC9XID0oLDJ+9A{rOPz zft7Tlk;K|V+nfqpOSHoFU!O+B$9sVd;OfOX9SBgTjsIq4a?8~7{|07U+Qz)RG6SJI z{(;nf>sUji5Jj&m*omIiA&puvu5Ggm8}=_ZhH;NMsd}}zb3>PsONG}I-Ze*{@ 
z9YKqq%n9B9c_tkGzs>}vSxyi;aDWC@Y4xzPL3@xgwFk|}7Y;Xp4AGsaPM zoY?{X&p^RGXqw4QQ`+CxsMcMPRoQf+o*k-kez>cj;jO+Hw<9zR0WRu=V3G~xCaPL% zGxwag{!<1tRFuDP2Xd2FP`@)v2`+ftt#kkM_iIsMa{8 zgYL@m(~=0k_&aqmBF>qK#+zesM__|}V=JJdnmxk;ScUqkg7S06Mk zTB35^>vOl=9OrkK1~}~V8S4S6dtvSEow)1x0UKUhTEF7oHCbDF>&dgw(%638J?kdN zrSN7_R*};UIs1e6dK=uovv)Z+w#llf)Gfu3(nbn*VtI!Jf9*T8igjfBFou->4hVlq z#f=<);qa>_b`nO+a zdpYh)>>Dl8*FqS3&3b;MJAUxDG$Ir^fWy!wS)e(&QC^Q~1IDF+$N|%XJaFE|*B|5W z1$h(tZNMQ{5-FYgK*5b;Ihw1qYWmH_3kz#+Sw$w>u4uczLL0;*5eWt&>!*)~BzYJ) zk7W8K#3dRTlusUuwNJH7Cm=g$Qa+3Le;Uvj`1EKp=IXlCNn}+D)7YQ6GL(~KHE-uT zh z#NZ@Pcowko5oZ0VyLaFMw+RGY^@H`T@)3naAcQMJM$JC%s{}8q4aPKFJMQaW(Q7If zp3v+scw4uJcp1cRE@=GG(joS01w$tAenhf~JZ8Uob+#-A6ic!|Vq;OJ)&@&q0=?uj z{`*i~``S%r5^T6eM?2x#F2k*=D!?j@CnBiy zaL=VM7QCm&?HX&eB?QFi_9jrN2?`^$eeH>_XKf@D4%D&q-qA6#26dow$rha!1t$Ra z7Vh?xRwVKL9jL8q>I_!>P6!#d5V+#94WV%oNg1R^!q;Tz5YL&7Sec%XuV(zws&6|i zp;;fWZ@9a%4#e7I0@Oow!G&C>!J~F}kGrI9{r5Dfs=@7pbp-Y`tK`C|uLg5k)GNqA z7;q`MKs)?tpYgAl&D$$}>#=`)_Oe(aXD_tU^w6M|3-9fLTn_A;_8Gh2CSjLQA`dw| zyCN<|v{X;jxwO7c9YX%es{3N=*}R(ViQkcR(v9thjUgw9+D1q49AS49o>qE#s1cCx zt^;*M{Ju;CddqkOBL5s9mx85MXGQ3@s{#=Ilqp7jc5qJ_nk0An+Vx21REmYwswmZQ zot1aa%D>%fhHqSdE!F4D4nnqyj_6=gyS$`Z;8r{jMbX0znHl*_zb&+-sD#5eQl{u7 z%00i=_N76;U$FP2=RFCK`Gd%#9RSe+JDR=#u7CTnclH8@O=4#x`lFyGp{#~qVWxjm zpbaUnD>QVB6NdCVUTICatSBi98uZd3e*9bw9>+;thP_)ucYubvkteG)r z$;PC3fICRGL| zl<+5wFF4(HnuItgmjwXA)h5*q@do{gjJ#91dBP=8aN-Igp6-W&(~1hNho+cfuO({K z+xQ+q&ROb-dXVufy(ELgD0!mRI5T1vJV5-)pPld#iVw;a#}dlOO_#!mqDnLL-rG$* z=2jbt^^F#j^+WM(rUO~Z)J+Wz!C#zj5^53sgJ&o7Cm*K1G|1GB#O@DXdJMV`)_T{y z7wb}{2aRR#3BjlDE90X^(6?Bk7k>UqAN1^&h#HH7ug9pLi4TJqLT!k7^A02>Iyvb! z9l(>|Q~E>Wp->*}3-&83XV~+;x7TYpN7FyM!S^K z*YTDkokg-z3q~r+)OZ^ZgoBQf7O^cQl8JS3Gt?8c9!Om_YEK8+OjE&j(Go$;Q@bu^?MC4w?q%ZNj##3Bfmj_(T1` zM1v%3_(a&%0ZNi2!x0TEMr?srl2E^$b-k=#%zD$AD?@uCZwy607oYLqgBSd@+fEQn z-l#ctz^yk~U^G9mkY%0@oLKgOAadO+l>+uPF6vrPd~k51>*!{MYrDDj7pXrPy!axZ zQtx>Vkj|x@?TD^SmD?50PrE&;gdny|s#&k300g@)F#t^=2TsGiE>{V6tEyQT53;2h z544}7={pgs9M(Xd9HOYFi}qQS+J|_#Zpj_l)o8~0PM_?IwJW8HG=O9L-xNyNHonm1 zmd;+`PtRz+a~ixXYn`ru`b3#BaOw+I>^yQ$o^!8{2DRK5x*k7jAs~wzrJB z|Hek7su3Ju;1@H^_~AGY3Qcygxo?}>qpEg-4wU~hGuWT!mZ^;hW)#JP#uUA0$JiQBp{)Qx-M~m4+^zp!BHYVz)K0&3!Tq6;lD8PH= zosaR22IY)NEK^jt7ZxU0_}7DfSh;H*cupjS_Jio-*l7*1YTm=o=yldID)V!dbC0*Zqz{5=9G{ec#CHfb_6s()DfYdPaP^Y>8Y>)N6dX{oR+VN$d;RYTw-am-!w8QR zPgxf0@d-W}%|B{s8k@k~JY!UbmduG{`vyxC;w2p0H_fLOb{* zguMoFrU$vvoh^k*XAGR*9+ANjsW)o5PGX{wN;pyVMm(#_Ll5RU2QkRlIT6k42K zwB5Cu_0+N$Gw+Zypa5+qSnk;gQCGxVus%vt07l?lK&8JO3L;=lR+D!t6w?-I=}hNI z)1KaP(;|Ko_?OG#1qWrapks!=#s-ZqYFqlx4e<^7G{3u=EC@%iegG}R)!0^Ei*>u3 zyRpDLU+Zides?~k$SqX<2eb`ad|U~`zu;_cqnk@lDaIrsoO%Y3Qa_!dUWcToGcM|3 zr?hkbH~|xqTwd<4DCv|S2;{>Wu)Tx!{G%`(r0>d^Y!%qn;5& zK_nR6SfW^zc-mA4?~-^sF>>(YsES{)njVA$sJ#E0=-_CA7)GILMmKt7s4xRvX>`eC zNac$S3|R>g2^YW&6CObWa1OET--Ie+nT@2qGJdXK zF7yQ37(Krg@GTZX!Ea8Zuqh^P(onO(;+P;JCb(v*Wf)>;GIF0IHYA$7pz%7Al$Nyl zm;cE2-918iN&jaSkD0H-YjC4qib-?KO>cc$881<4 z9Ur&HGSl3m_+tvJLrrw$Y;*`EnfXTA1&lX6YIeLbiwm&)F!nqUT5LhsW=Q+Dh0?SY0!=8oD8*smPs z^(`I%y(3#iG^cU2jEQK(GWl2L?tx9hWu+?d5OPo_Ul>h4gWb`O>Etyo#)RfX6AEox z>X-f-&F6zPMh5?ti~{3q@<^5WXd(6G)}McXNcPRL5eMN0qyPd9@`c2Oxzm&rBi}S$ zEJa}a003Ad0tw4`P4EKkw%4nT(S`PUpm~6!n(!O_WrkX@L;pO*Am0Edpfo@|pzyHU z)k|(z(gd=$DCojnPO$@w1XV)tFyvAHNr?`A)#Sx^9Jtp0Iom(`i2nYN{yWzI<=pQF z7o!f==d(3|6V{;?6aE;gRUEG5Q182eg;2xBQK z*(+t=g>2c^v9B@qJ@@T%&;8x+{r=AR{m!}n)M?J?be!h-d_A9!?J3Tp9n(}?Np<}2 z-gsPMz!?pOxqe$&Z>D{?Qe`n$4Nv-7{wMo6`ERCk%77U858Rl784UMuLEO166^QaQ zC#k=nuey&pc)CxZjP6<7IRLx|5E%m=KkvDgMfcC_l)cUj57Osa31S&1qqz@?sDP9Ur zB&v{Fke3&=k=(F@qaJrMF=|~EXgQ+*gsQf## 
zkO_HywMvI+t`)(Fp^%9Jt0X4dywaN5_FPc8b-af;eVE$oM z=Hpo5mQFPkde+$abA#9k$LBw#LdVGT{sQ=3jiScCNO!K7DlO%Iz>ij07nP=^%vS_* zoo(%+3Ld1Ffe~r233qUnNT=D4QHcqa$ko?S&Hz$+CS^X5EmbjW`kKD#3$75J zmtbjrn--Y{#io}CN#A~oS5Yru$~WsjNCV90^Xjm zv+Ga(?8b3e=dP=^*Pxka?AP;vili88pQ<~LxDQ4WlwE&W-x6^3R<-O*Bs%+eJQBvU7{JX zQDR?FJ0APR!qA?;DHcd496)ZbfC|EZq7#*YANyHBsSrQ{66PctUWhBfIAA#&wNP$3 zNdkS_1@?MuC{V`@eNuglim6+57w20OY&@Br>o8WE+&S6)BG8vUYLF3ehg3MEm=w~= zhX!#HSUjPX9P*x22RUkZG5leULn)DxJLJXoT1$+5)%RQ*BVLwRV?!*zr#MSn4t-e_ zwIBXjqR@VetD|Dj$)D_QSG1&~A{TUl1OtR6EX26pQ0c=KQgPyG79VQ0zQUPc=nXO!`^1JvR@!!1B5 zDRW~L5KM|bOM30cmP!u#lz;oo28`WF5k5D##0Z_L^Irh-{Ofxe|1LE^2f#;m)upNb zeaUBBXvQb2ONfmHo=(i9r-k-W3p8Vs8?Q=7vzoBMD_%ntI*_CoJ54ylzdfFghVAr|cjvk!cek&Rux zzt-XdbFDnRytvMI53Wc*1z(5Ho-NR5(AY7YLkw+T@|a-XonJyyFwIDVJ>wc9A4Btn z+dlB;iV?p!KtcxTWNVP(VazX&7gb0>j}M1;$s@T@W-C@O?9&1xD=>Ck zvRA2*)a=-eFa$+MjGJFwvT=(QAbYPG#+4twY!$5{E< zA`WU4QEqgjlBKhYN(EMO2k?(xkoEp4JE2`Q4Z`#8U1j0R^}u`Qyip-GkbYpzO*K+w z`JF?RI|Po%VFlmJd5cHpOu4J+tg9>Vt5mYFl;`|0deZO#cuWWP6g}xXZ;v~wILd>} zSFnSZa`OMsi%!q1mv266jSkQYB7)3{pu&4Xoo!oiAdY#k%gz5o_qzm|{&%|2FR%x2 zcywz3_ECcoX2~wwO*Y#=d-_+xx2Ai{2T&%>Lrv2>nm*6tjas=YdJ`l32i#ROzbU`O z1vc}kjg!^7G~p;d(34mh|HH7w`s-5ln(c89fCMyS8c8LUHcbsx9`n@J6axf*+AvUG z|7AJ{ptRQjXb-?_Wy8Y9KpMf7zptX|FGusWOYj|_{;z!hk040UKgr-SsV8$dzAL6t zF)I+R^>0`0;*U9%k2-`6f(%jrViVj7%6I`6!>X{sWJj0$Ky-LU^9Cff89=Q^>bJae zxW*GgLFxg@0|I;d%p;Nil64;0fi|Ig`RcWnzaiL5W!2}4~+Qh{-Q^w2=aJ{f5ZsbhK8VxubI7J zv5|$iC3}7eDM;B>TKAZL5k{@s9|_eK%D7(U3P3#HNrBE(eE+6S$0c30z*uwtC6F!S zVZ8PC0WY|aoaMl^IsA)jdr!d62(ITN_v4o9)qiFKZt}B0CMK>IP+FFsDH7>x3TB{W z)O)#(;)GHV>n*^PyT!5?RKf#_{wSj1{VaML^wl8CnVh)4zT8?)z+ z`t#*M#gNA4ZQnil^M46msn)|?s5X&~i1_6@jPjjXU;M^weaeo;fyyT+c{zUm^}{t? z|2Ol7b-gAbK#0zW9L=q^8c&IRn(MGa%RTN`W5m1VRFn}@oHykBE&n$uQ|~CM0t)&h zI4PyU4&?I`ll7@stY}pgXP<94kS~KEU?1slsPqoiCiWgysV|X8+hV~R7iMpX*C~|s z4Er>Wsf#DWf9srJ7-vnVMo=Du9gdCcwvSD$QHrxKDSkuP$UtE)`v3GH`gZ2_S$atG z8>yJ68VgHoq>q5GHTF^54cu@@c&Lu+p8DRk0nNh(FL4*mk%!dr#aCahnJJU2(4wg( z%J2++d1d%KADY)c=O4$kIxZM;tYBBJVJ)^F$CsS8{<}TJ2sT>DUC{zGXZ$<(cP`eZ zyH*@tIX1>>(`CF$^yd^`DBbQ3UbCFA%zb>OY55CkM9m3Cm(fiDtgrls$x)?Ok}2Xo zYdG4NCuGsTo`i=e?fN7DNc7PUe^zKZ2|1tZ{CHRwa{(7Az1OFIk8-Kkq2 z6K=Cp+zF51)#8LOlTTp?w%2&<%(QLkJE`_aup}+oipn@qoY>^`3em1=dxMAa6JZD( zqT$x(nTvmif^b3^3{JgGj_nM!yavXS4CLqszCqX}v8p@XmEZVF#vAYOhI_Nu^3Dz3 z1{I|y9y^*Qy5+kcGVQ^_HHcSqAsis45K45?mY;b;RS+Chc5z9X&Fy~vwZdD8v@^+# z^EFX}sEpG##s^sIY9(r<*=ib*BsXbth#}~m;7ftTP~NmO++Decq9=m6j0breMgQ^> zJf7LwGzj)+Qnx(WyWZLJ^yZ1;XMTDL5lu_tgIuMsfW^z26a}GwktQpZ_<)`GwhZY( z%qpm>PJtVvB8TJayz4wC@0)HS+Z4_8!6WF zDVvybt32{CPw&L-exIM_JuImgd-!8pcb z6|3GfiN8OMZkna1KR1lsbUr;;t|r^PyW$hE+%lZ5`FCRjs4$|h zrZrTqr`BwTEU^0KSSBi@^ZG&`(s0naGg_zjY4wfwtzzB!&=$$z2O%Rq*x|*l)Qp}; z1pzrl7XG>S(b|y>w^xvug&TKm%aC5=O-SyY8wqy(_i~@#9iD!-wJ*)$+vbaHG9Tso z5~=OMb3b}$z)`9PN+dy$q%_MnXkyWO%q$Ei^7!Hvy>Q}LcZB>TGhKKI#gkB9Zd42Kt!J<=ZkI36rlSU>W|LL8>0ecu0PrEBFi zFs(YT0xKzd{z5h|vN|OU1j)1kwdzz|3ap?}e;LLEFE`PoEP$SRGubma0{k|r3Otnm z@lIZT_FrT#A^zBV#F}P&N~4N2^z&PoPk131lH5VXKx&6$nncJZs<{)e6wE~n;D3#b z451rlp|g-yk*_iSx}1%nL;6MkG#t^fz6WCtQfNtLW@@;TJFmgdNKy2C>N?9StyrGa z!7{?#k0DObCR#oaysIQ|UyM=K;x}0unQmBO<1P7b`Xok0!zbGbK`tcJSsCzLI9H*K z{{-JOVjQCjncz1^n1FXnn_}vMSG1J>u;hD`!Oc*hFkPH(s8M)iq+;?@O?~xP`~53o zob(HDd-yIk$0XQeq4m03&33r@Ezq+f?4xS}%lv}n0OzDgWwe%Ls4BC~IJ*ucjh6hi zQlHG_?Mr}bY;$R=X;ktD;Doh4s)v%l{e2X_PLdXV=k>d0uS;*x)UVX_;@p6728xSh zC-BU7RtakXI&!IruRk0{fCS9|s;@y(U#D~<+T&S%XE0Mt>6?ut|605HnbjLa=WDsC zQ13HxUXp!gJhJaiE?X=2I5$J?=1Z6UN&Nf5z_OR!1D{6zFnnSrF%+?Yp(v8jMy(@1 zq4*E$JV)QAyqWmj@?Re3EIb~+A@yIGiS!zTFgS*J{pk?kt~4P_=2Fdu=JTZP0W*N9KNYdu$a0j; 
zr)$8mkQz+yazu8CkjmTU4263PZGP0r?=urr{MYCUu5zuu#r8T3dG1kWkul& z&-&7z+^5;6+;^vsX&ckx0kU-z14(T(pE6lvu~mJ(*!-jT-M)-IL*3N9{vo#SFHpXm zf0HsYocC`ifYG8nsUJS6x7KTh>I3UUCOJChYPq!Z^eKISh}XPZgH%(8t)HA2`pUo! zQ~O4dbZ5(-cU;o9v-{o3Bn}!4_83p3p6La>lXDu-9;4ti^77O_QFKT6BK8lcY@9C2 zE75YMC-9%!LFXeuZ^>iY+H)It&sTJYm-cLmnd1Xa|9<~H!E`)O0-DBBuCA|q-G2fU zcjpJdi*lJ4)%NbPlK&s~CGh+{Kh2;2&j@_=T=ug5fob_a5sjB!;4fp{|B6O$2MFkQ ztYSa0O4vti$$W{W<RZG4h766o4>Cr| zNvE}E%4+C~gTjQH%r7M<(U%fdG}rib9_&tB=+yR3SSIKt)|2=M0uRGSB<^?1_Ojv4 zQZoJ1qo#pT2WPzpV#-Q<{j8FoKz~k>6ZZ|R3w}u2lJ^eC^Ewf?%iD)Cx^%LH6oG1KURXC`8^l;GB@NHUTVnigVKu4$XfOgTsPqdcZ z8-dSj@H@w}uPCw6s@BXQPFE*}me0Z1PguZU(O};kLy;2%!Y97$%}5QJ9LzT1&C2mB zM4>4J9R~z-Mp{*V{v)TtuDQUeXS2%7dp@hJmzieuKHa)8R&hVLL8wNJUQ|@=qOhy$ zuP9Qn7oB}VP3fME@}_-tkv8^0`-Kk{$X3?lcr^0IkNq-#-NCwuqT#_rVr{%L$i8kM-M0hr%r8wPbtSE@I%Xx3h*r<4ej&LGk~mBIm)Z1S;}QEE*&bN!!4RmY{B8 z7Z>7`=0B6h>ueBeU#<@ehD-vI48i;Gx^7amjAiO6|Kpr@Sdqvpz}=6pfNG4&mV(YG z5|R8}d={x{N{4Rs!)oMj1JUDhChEe26TqJ;8<~F|f?97~L8qiIfhiT2TUl-7WJK?* zpxyi)(w~qK84+|@YG3em8f<=0aTZ4hbEsB!e* zMy`}mCImKa!*Ka=4w5L%vRkj6KZqlLrHa+uog~@kz$yVQi}alVt+20s< z4j-Jcu$IDRV=|c+o|`9~TUr(kp#Dz#JBOR=_0sECKbTprD#)(8H!`!&P@();N})e* z$!9A2No3-I`uNN*gdcP{uYNdiK6lI zA-4~D?|ROTe?JT%-!eQs;2zF5gc!>i!D`6I)wfeTp(fDn@9fvg!v>ID!?&s z%}cDG9kR78VT3fcK;yQtJt5*OQrkeKre*E+fk4V^TW-@eV|&7$5KqJ&sOdd8*UXE* z1Vd*8XxIGD`%d}>Kwk@`sK!7ReFHG{)ThlnIJJ2mY6$75)_I}O^!NUd4k#Q-XXVJ- zAnEA202Y^ZC!m#CJE$WmPjlbnMRJhO6QBb8D312rRQH7TCAITaLn(0$O+OzXz(S&C zqLoDRI+){J?npaDQL=$iXfhaGx0By6;G8Yv zdeG&EZE9vhumhNx_~&=qKBlgfN+;CkP?f~M_~|I?TW`z2BR8(rrUAw>BA0eHTJ{Q! zr``BYgxc9G87NvV1X`G84(Kob3#!z7DF*$&<9xZ3=tAA+dS&6-`47=l4Wvgc=sJ`g zO{f|9B(<3(Hx3;frubbg$|`dZlPtl{kz+*DXd1B|E2r3iIFUokH}zT>d* zxnl`B7s$*ZRxbS6=6yIZBqHBT1N_pu;z=C%QT_Q}x8L^|`hYGC$FYst8-7cl15>`5 z144|y`|^3cC&AUI1hApZZTNkW@?~~ZiIm`$n2>)*(KUC;K#MA9qyRF%?*g9GIS-CL z(CpS61XpV6YgK6t=185=7&=7WiuAoNJ4(|LU4vb%3Qv)I@bn>5K(;D+qTSbmpT_(u z$(F6(0MZQ=(VyTEY*5$W|M4VK$`n(tNHypTx}{;2cVh>12M{6|=^g`O&*vNp&u0R} zw#`9z)c?t(+PDusj(MW=^5ov(SOL}KWJwAqJel7J2;aU@n0;7r^MZv0&mr%Ef72=< zkn-Il{{@Ad*_TO-W6$vZEw<1HVrLkptFXK!;!X*FmxD2lUb>Vd<-l74`<`gV0=-DuXXBeceCn|pbunhF$V9`!Gz zhh$I_)3i3I_Zn;j@=%3X`+$YV6Ic_udGLLiN3q=fgG7H?fu)zt(bl(<--P+EyahsP zWa;M5BuNTpukq5&kER7Mug2uZ^t6Upc2YwkzP(9Jb>LtNI7@df{^1@VAf}KmLItyg zA63ts_xv3#YhH`DJ`CkOYuJ+wiX#I+rRe|0x?KE!urA62`#aZ!WcI%%?02iS2UI*y zViDvmHGTfVr}*dal#*BB(!Q&l%1^#=>m7gR=1WEP^sP%r*nibsUDoW_m*@MJVPK4$ zY|{Lqh#S&{+8egJ8iw5;SylOz2*(tqp^egWbbWLousKMRoY)KlO(i>U03%UVDt?=w zv$4&kX}O&edK3rc>3RH~X%1>f`MIyN!cO#?WOBwElSq-s>kY!xHDDtw{xYl%&Gh)z zH)9)(cgT*4-1zdj&3C2LFn+XT-TklZSd!QbW$)rI-ObhwPl`I4yXaj@oOUe|MTdK) zy~>`g9d27&f+$xgkQDvYNQxY&3^|el8zq0c()b31MdOc$>b}OioQu;t0U-ZC(x&DK zRuIo$8nn*6is!F9Eu13KLRW51{E4Vib!YGW+Il^PRZ2c}d+z~6szm^0Xv|`+HgI@@ z;?u1sVm2v6>+4j5dZ4xc8Ca5#t|d-_|DZqEIX=Nuj{nhOj`D9Y>~^?Wsrtol%VRwO zxIZX7>Tp;b%J9Y8xjZOgb7X7>@^To+S5-%>$OqWqDLTkEeH#sd=;Ie(Cgi+9M)^Uj z@29RLT0gN429<&!PnTK3&XwQUtiJ{@hy|%tQpm> zrr+1H{7mO>UvbD@7_M~~Z?2fB?H%h4JgD|Ke^W){2O}w*W<5ny;f0|#3JGHwZqL4J ztNCbzDwd3GCRvSlo zOs%zeXvZ6{@_kfI74|LG((;^>u4q-jEbr_Y{3-HId*EB(c4tb$oENXZ)$oQ}BfBxi zr!c~Q&rzVx$*FPv*=I(00@tsd{+oMGN47sCk^;A^3~M561NpIbzZ}VX##Rnx-eSF7 zPXzYVSKPLhYc%Qht(XT`;cQm0zB)t?dQv@z+cowZM23=g>&@G5!mgpI8Y`iWlm;NW zL82+8y7uVFr3lf=%cZuCs>C~mJx z9xN(ViAslz;A9us7?rlA<6)Fsv#q{-9J|}L-lKA>ZGrHwWwA)>O%Xp;{Y~XpBmOPH znEH_3R`R!JcTG*_QF%d2ve_Q9K9aAFh)N0E4cT^L`r=_~d8)ydcHa=#POCw6&>Lo+ z%=`;Zh|#t%8Q2UZ$<^u_0^EC0{h`PRF?wqpkEumqqcw$L_&KAfcv}VFFR*933*Lo6 zz-GmXn~@O)65NJr8PIL$X7y_1zz--gYv`rg7bI*stkwrBZ_jf}bZyntcFAiqVXldf8$CF0)mdUYngw9q4U>9A4?lJKfh 
zLm2nY-`2aM2_?K$$4}&WDmcnio=_P8(%v7c4k`Ru?FcUzRj#N@XbhTx%qWIM#dgMr z!1~lJ;U{>q{%rI%pH@lA_j;4B)sYb7el^1>_0jN%!ms$-IH%y++Q;9&*nA%{FPd}| zyXzvCB5yxY{e75rqvVAi+ID8X?auWaQ{#Sj`rLl>3E6Tl{c%9f?mP9&*!02T?H>){ zd%Rhs$30@h!_TT+wz5KdI0uU?P2W{fsatqcM|gUXD|-C<5t*2KU=;f1)=}G4%q!5R zby-hr9#3xLt7QasEnt@V9q3kxfbZS)BTs=xj3kAvhK~5ZcW6|f4L3~cSDz*+OdP{5 zyCvRX0URwadipw3JiyzGRP&2(seMp9GY-XDVhXVxGA?j6RCJi_^7~9|!3ECqu^>k?wRcyGs5dbYwLOP%awE2Xh{6Q-|s`-K)yR=mcV1gTcI2U(J+ry6H)cz4F6o z?rF8Oyqr2VQM{J**4<9+-*y6bJGqeFXe9oK=5AD#)SnwxQNS7bIEkj!Mn=D2gg@t* z;38JY6Jsx^8KVB~F9R(XN7C(&HpCA&I-OV=LjR0f^$&R%-sF}L@Joa41v|uOHO*6n zgq0p*^gw3YPaU-QO827evL$e*-shnpDeQK%&$gwg!T19o@vz3*{l7axyaczgE=(;c zf>xE=Vp7U)yEp2!w$KO_KJ8aaw{TM!L>(O>BmlYMIe_#$yP?C;s{(U|=k!rLX=8*? zR4iG9^tyTCuG0ZmQ8c(P85;Q3ZW*PXZcw;xj%?QW_L8+~OT-HJIVDNR9&uJt3o;$r zKw?Z)F^Zrp{;`xk@7iUmyoZAZ;-K_&?C5EvFM_4xd-iW~@DjzV9ARt-WSc}H;rqqG zu|P?p58MQE-ENrm5u%D88-m!A~ zrQ2A_ckf?Ao2vACk&;s7A+xCYpLUAaS9Hhtvt1T2lb%3|7v>f4-!AdZb8OvLRiZ+N zC(t@%OkRj5|M#-ysha&sY+$O?mOLue+NxN+Ks$zc+2kWel38Y6LcdEev3-7im> zLd`TSW0NGY=~o}u0IJY0Ugs}a?=5yva?XwyG5VQ3O)cP1jkMTP%Ke+R`|$8j_PKiQ z;@xLc;}TxOW3n5J-cv>I&3gJCjzh$RwMNH^4t{uJla>(xi1&|v zzYwqa-YDg+#`%uAhGHg5ZRZdk6a+C^NV@6ge%+m2T)n+TOiLVNrJ=vDJ3qgWQTbZ6 zN?|7)1C^mZMC&HA7@}AX{LyP)yrQFHTm=pmMQ=RENZ-4X0eH zg}spxuk$j+fSR~tz(-Ac55a_^+s^N55I}QSmecfNWF~pwAD-& z^3t*<3`NSq@Lc}=*$C_{F#ov&9R1#YU?28U28RD>H+L8U>g0d*FK!B$Oq3*lpcsJh zr9XaJ27o^laX_=8eY3*mbve^)Q#d>iL&MRFluoUh?E~J0mfvW++dMl~Bl$RjiQ8T2 z`vR7pCP?v>4z(jfXtAkY!CM0f680yu$CfHMoin(qXU{fU7(ig+#4sDdx;L4#j^v~n z*~%ynfta$2O=&xk;sR5(7fbphKAWnX(LE?9+#_t_tbzy9+_sGc7td;r+ze(zL&!vM ziDYGTEpe>w>bi;6O`&K$mBi~0vDnyn6b{Dr;ltxX3OrE}2XAKt(kafoMJeXAt^GBd z(01x){&S4k?iL#HSh~fYa%|)TVznWvJOZ_#`9mW&-}RgsZTw-BUPBco+ZkaQ)$N(b zg@`nndTt-&51%$Z7*!(?Zt;L8SHU8vTWC$9uB=fX=W)I5*L};6Z%RSDQcRdaoXk%o zJ z4@<+aZ`8?*`Vk&x(mIzK^WfLk(ig5QBr^UZ&_rLEDLfQ+ODUL%Cz?$?&keQb5bi@b z1-F=ENItz)ZR|-`c<}O*etV@>T#UF%&!!Wb)oH%cjBWXCJ#D1_s8sJ=@h$IZpBG%k z9*VSIP}J5>9q)>qj%)7sh*@a+d>n6d97}j4Nf%iQp);!eUPTKYyf~eCnjN&b zSQA?5oMXm;ldus%ARz$<|DgttGYc-1xv-H4@rL@jakJbCf5;s|&=9&C z7suHbt4Czx!c_tgBmF_mj(LURf{O%4%4wXc2qzb)hH-{?#ag%N`Vp><5?_@q|7K4H zNwrCtaM%3~@j5Zc9YK8Xv59SGW0!n{L>2Z>SIVC?sg6)b7_1s`tNcUsB3^ZOahD{< zwm|Fh!+t&SCu}LUliSRNnO0u7RBrndQy`K=WqtF{A3u65E=Y3Zt1WPE7F2^M|Ix=s zc@k@{LOBsz*>RhpacYoa*M2tu>(N};7~Vk~y$ zn>?kuQr}_=u5?}|pnIILBx%ilDAg8RX?)<5BW=sTdf23A{B88gS$YCpPc%(NV>J^a zE&W}jFW(gLJt|0~-Hda`2Z7p%sA-63RBj$JRy#s;=f(zD0WGMJ`y6;KnWP7T$bmj2 z_#mM{8)!&(_0O!~Xdi)r`mzk5d$nIl)w$fV#z!51xV;2JIJ$Md$#_E$hQB(`*t0+x z3@m5N1JM!jz5fOhqbJQ?k&D2$pfqTtNN`LJ2vT|3(;x7Ojk@Z^5=YM^? 
zqNBE^^v(0z>S%fY#{$iZJfHv{bm?EwMYPByiiq>3{@hDT;OvLB&}I7SiYoO|PyY%2 zFyBor-T~Kz8S)@J@r=K}!iDl<;>OHjm58_Sy}q1?M-fbXYH+gqq0(d~nd^x9(T3=F zR12-staVekbch9>pY9jPGVxY}CI-!hsG3z6QGX}>f|Ac(T3%NLi0#^L|Cd^i45%~hQ_vGwaY`xUItmYs8kclX8@^!;2G;th?$tpXWcTG$LINy{qZolp@v?i~1 z^>Horo~g6Syerl`6>{<7t^H4c1>JuCb}}#Bx)`%Gj_an zbGsEHBS5!kS&QQOJK@(5CAwCrMbM%kBbF!WzcO?{_787UtLv6Ij|9@sKK4zC_nM9m zUlcMHM^6Dzi~-f^YKbmrf2ibN!n85(5MIBU+^1~}um(r~uKgF_WPRf;(&{N_;}VaA zWmMeepVyf{V|M0LaQPZ=O1JvpSuDAG&+v}-e9=?SuyGL-j%97?c{*bY%cm(Z&#o}f zZ+|yck=k{tY8P~;+ubi_OH;)W?|821UqbFrhdkQ730(1_fonBu6SB?F=O_EY*)8Q8nX_%9ikC_jpvWr4*^{=Mfb1h5oU*NYQ z@edq_Xbn&vww1q({6GkYL0w~O{#d_|8tr@XC%mFRtNEIYEwwAT0o1zVkXwXxlmGYR z@(*qAUBnskqVE1LC<#v~;v5zTzrghBJiKvo9^v;(>^Ph^FBUiuzu02-u=(Y+YuDO^ z46ai2P<>QvKOdbttUFX`PcrrzB|StaYM{t@rWC4_YZ^^@F!!6n72xD@jGys_ipK5v>4D~4p3}&~^)@78} zS0zjm(A#`rwSGM`6T#K3UIeKqXjWQO2C;>(NoF~<8)&r&1)B}q(U24t5myAx`v8DQ zM+5+fJZ6Re=YaLA^)1lAVy_+m>*V$k(Ek26tTd}%35-S9>Jc# zAW9`bo?;0jQ@_vtqn4=GOE*`59Q7XtN?90Uk!CxGp_1n}`ee8u=ZK~t8m{h2m`j(F z$&2baQ#qWRS8AG?e=plDe!YVfCOU8%nrR@P=>~(4qe z!QU|LSKRpqv?gYA91WmE9B_q2n*KS56ZLo_dHY9|UyNkY;@k~*H1 zJZJ3!%As2%^=OodMByGyHWP5)Np0$Y*-TyXmHd-s09P!hmP~nc-DSx#(J-cg9Xn3=>YTgx^RGoX`ET<{T95Oeh zX%1L)nT!@xsn2*{2_yexX4udX>fNLlsyCfz?Qu2?v#F8O;lY1%opSQ?;`QVy zd(xZE01!}bSJ*?~-_*Mthpe=HqG~?PsteWY`C&wp_Zsr){eZV`+Wu|9VY&Qi52@Nn zd6{9Sb0)c~fXUDG`K_o{3-YjkaWiV&xLNFs{F#!j8WqI&sL6(9Z zO*k120V*E6k_!53K9Due3|uMB1$7?kuqi%%O}QUuUstal=a4<`!d0NAnH{f++!`s^ z^b7AYIbwvvU<)v?B5jMxv_0(^>s%~wakp2(ctcx6=oT#>&CxZ%)kZg@O9)lSi$>Fd z%PT8YWaENWLW(RY`Jtu{HJID!2i>{3sFLVabTlX3;ntuqX~@5iA3n6*o}A-3`m|~N z&Ge6c2E{VTR&XN4D|&yJt-D$i1LRs|-ou9dEWc2G%_B%VX*)?h$ugNk(|P1wlZpGA zVT>BWp`(;_WMKGT+2fEG#|=U^i(Wr72h7o2Jyc=SAc+>x* z4|m}V%HD*`D4E%gXRghjh4)BLDv^ICnUS}O9MC0he`u+Qr90yoX>ONcPpWB1K=Oo; zlIS!cT`^rH0e_w26-b~xp$c4rNAIK8{IhOKeVaWN#mGosxo!QhU18hD_0R5mOoE*W z^3YUpVT3#&8x|t~ZD+wP@^JBPV#WF&9#pO?b8x@_kN7|J(eF>#zu7Vfw zAW9M_Jw1tVB#qMdw-BRlgQf|(OiYVbWF*24VS#`k-maiEZ=qWf1*H~(iAY))H;hk9 zG@t5q#5#fvUWhQ@`+FuR+0;ZzZ^r?X9xyE6YKk;at4BhfK7IO*NYO+Y3?K2k+F~E! z&A$@^3-3YYA_lN*RD4d-@>^ia#^3d4UG-J|ujwM8S`R+r-bm@xsmzjk1XsZrpjSz4 z=@uA|aRPbz=Qn=%x>0X}C&RU8wjHPa{)(Bfd+coYdDIMJrq@B356(1dKsp?ogQ<%WVy| z--3PCZhhvYUeGd%O8E6J_pr6@+@=54>jnT6{!QGW8ZUvKy3 z);`^pYd$Sv_ZoZepd5Ucp`&fH@F-xs-78>i9dWw;uJqAhA!V%|1UxwKxWu{i;aim* z;;t{N(+lgeG>=|~&B-6~_9xVI_SyA5Gq#Dd?-!-U=Cos;t(L#v9Er7Ot}0}Ub?yEmdtQ3}sDHj`gY3J@@~DT^!9vRCji#6-tGCm(lvLZdl55X3 z3V-+?i_Vta3e9kHB!;QWLEiTJ3D?u3%6MxJOCh?|)6iS{StcJ3Te|+%ZdoFfpByuz zuwImSTHYs50z+#hJ%*$L%d|*6tM?==3e{VV4}vF%`gm`2lUL-CygQb_PJEc_758l; zjP>rgy2J~s2My$)L=nzPBPX| zZ21y7fX&U>9YmyPi2^`w7?NW9stoNk#1;9{gvn_iSQwG{Uq5_Us;Z(46w3fbYvf{1hQHtEX>3XTa$yjc zVec<2wgaA3&l1*qvZ1ad^P;p1!dPmX)ne-kND**}0x<-A%Tf*?S$gE~v|-M>&SSE;3t&ALM z`22XE=9{&iv+Z4sQ@TRQjZiWBDL)E%=9@BC!=gJV8h2bLKJ})2KOA+X@Gs0kB}~YT zHv7@$Ub(32D7S_kDTc2scgvnJV8V6|l73R(UoLlyMBJ?{yV-jvDE12aG2LnYS@D3S z)z!J9Q`Zknv|r|TK<~d%yIH>60bLsaj@k~FW9PYfqFB*4D@XH}ZSzvpS{dh7`uT^X zfC;_Pip0%vC@wJ_Sg;75T3Bx?MKFdGmD?=R7(jRvLE%HZ8+qMCTX)PIYm-xwfDR|Z z*^q+jW-VH1!JD(j(0FhU379E~{4EN0s5R++*uOA{% z;~}|Q`wgCLF0x)t9+mo~vM{LKbHO=j3`>h`e%Q@7lT;~0NfilJURRqs5mPOfhb=7% zom5DAr$@Urg7v1?uel$b`tN2@)DfT&%$OYvgY46I6Hbu;76ldL9#?0B$Hs%L; zBV6(5D&b@>&Tk4B|Ecb2<>LZ^`RxQ+w6ogvYZGP`A`}T_a7Uy`NQppg{;SeP(hS7? 
zHR^YQ2RIs!tT@yzlKUSUNeRc8aIV5ehA+5&MJVk#@5l}^Zje_n?&?SSo*t^QQqrpG z78ih4|5^xKIKEt{RA&Bf7b<;z0@_6*B8yf6N6l^0q7HNf69GOm&k+P;#uSxC5 z`N&?A^eS~91@Gl(Qb&>aXet4xr&Ti_Y9S(ZtK?ntOsSECR>?#iSm6;`w+~s6q#u!i zLFgc!tO#qeAq3HCiWI4G zll~Pgf7OQYv#V58BveI>@90+9V`;<@uY}jLFV2?Z(@52bzbb)A5K^UXj{f&bK{Y?A zd{*&Te+tGpL86R%X&7UK6PV?J*ukR{RTH0nq_hS(R42b`AOH3_p^3UWor5ObtGi{_BW$Zt9q|^}W9-I6u(@jlk@sxiWb%VM zVU^nZAHU~hjgKU4qiA4qqw$%3`!0oF$3yq!zdQ-~ouRDRdmGyQF|#!=fBI|l)f7z_ z{(#%$9xucQAWZAnlPh%@IEK42EaW=m?%O4GMw_vLAp}T*Ls7T|DJ-}KOK^90lHeZPf&_PWC%6~x!QJg{-nY9?-+NBq=YHQW{s6UW z?X~6_bIdWv{M9C9EM?+pLaK31cr^j2_fXQ6r$R$zU0XyzUEUiZjS-mc8iB&WGrXVL zCjDv=7v+0RBt--%DXjrhYYObA6(8A)UVb!39T*5E+L^#)U1vx8dvD;=gH(YD*-_sSEVw^s4o7yg0vz( zaQh=98a1iB^`l5;aCCRXlY?`YrC$;F^H~FU0zm?z7tOdhQF?Pr@_Pk&g#`Dzg50v4 zYbiy4GW2rLPL^Dn9zTqfXKaH&rx3`vT;PJFzcaB-rzy@`%gNwme-DngUSV&;(#Kmd zgh}#U$nsdt*N(=%yL-_{XcXFuQaD@6wucw_(!smD>tTg!rRPZPz+&{YbP@mgdEik9 z^4#T06!IPl36)H{_q3mr$lV4{&BV#2vtoEDiL;Wk*cd7+iphr`uJ3u?9|O=p z!|)$EoSOfr!|8_j0rhsE^7XYRAZtbzN6@I7A-V4bJn)B1*)OnD9C=UdeG^B;wql0Z z$;Oo<6|JI0C6cf>d`TLyMo}*+`Whnwnt-c2Lx}^1xauZ1R@ct5ytOKd|sr1VbrJ5lT1Z zJ22eXy=Qw@HRWeY;a9qnG$)*ZSnd)o_h6N>7>=ORSxU!iuxK<0mC5 zmFz2tv2~N5`iD3_vIFe(ts>qn>{BTv&!8kwfA)*`l;-M}j(JO$EJG-Sw(`tLkW&SHnX{3IOyEhwfSyi<01>$Yh7|GWzGrqXUBH7o!y190S38=Bq?evtjv;` zk62n}FqiCFMv}+;@UaKa872L!&(0&vzTHWZ&PyIoq?->cXwic(A;eeu7=akEh%gb- zIEL{dGf*ZSi~ys1o6k1^&B8K+(3+gf$u+M=R7}x0go-8T;wfBfh`BtsTLs)9wy1q@QTqi^5Dg&`%2WxQ}ys z7?o6=(KJ^mATq74PGeQsR<#X5BXT*+OU#;epNs(GQ=saUi%Oo}2tpgxGB8+&D=7GS zSw-oMMxHL0!%IF!3~(0N)Syd5UGHAs5X~divjJSjmFs5@eh~gc5dSPV95K92RM45= z<_<+`nFEDj1^M<&BqT}7uM<-j&a2a-Q8wiud7 z5@S-XM|bNyN}}?2^hMcAYqGW0y7SW9DA&u!%X2fx6x55neuw@HYe`^hmLX5~ zdaiHM7k4bj`mCwH=!-&T0iwW60bo5mbK6DiK) z$|~So+U8mQ$3~Xcd1z7X`lm&_iDNaftX%WW&gov@E^DmnB-0s7$Icsxg}U)apzveF zV({PVK613eFCg}X+vzkPGVGFyXgo7%Z?KqBba>|?5SrJZ5-$`3huA2J#Tq-Sw2)`p zsYis9`w^T~8dm@Nk07x`1;@^gp!Y{EhvYh! z9%INq{h_sc(X8z>O|x%Lw%ebzhTMWVZD{2$YdaoVDZNvK?f~VXBRFZ}+;=15v#dHi z+Ax8$kK$=MbIQ>mIc(cH?z@DrPm!jRsT4D_4+B)9!UQ_n+D&A->kCX3;Bz>kv^x@P zO5Nz9z8xTAnT2(Yx4mCI^=`tbcmh&eof7Jw-khf(tY+4v250h0^9XK|iGUnxKOQ2W zq{|?%8M#^#E_aXpZ>6{UE->lAyiPt z=>W)3rlP!?dbDej=}F*UHF|K=RPONCiyeLsXqSM`Tj?`DEIV0-+hKoq5E1Gv=!=`q z8a~s5&b=(PjNB-X;OX7$V#PU(i$fn>kK{EZ@b(I*CRs7 zzUo(s3V9QoL}^~jCMxfbjoii&{NawaYO&EQ`q_yLG0qPhmTjXbf-eR%?xK9z$XYn* zM|@$!HeA_v$H~b0)Dc47A~$!4g8uJpz}VO8I-P2scqH2^kRy>ZgsrG=p_NBIP+y7F zwd+>%7S>KLZw?PGZTXWPH|u4l9b34@o1q9xsaGpp8_2jwE~{%tONs4Y`3^d^l3SL% zdbAJ2&(9%VkMh@^fEhTcP~wkNK*!&WYnR`1b3#HLN>sMM70hP5ap8mxpO;EClPRPd zC-O;)!iqqQ7=U%I4_pFM#>0uG1j$}MVR437!ME=( zEHp(#DteE^BPYbDo3dsXu4^Zx=EYO}Km~_giwnk?m;Q@E65MZWa)mfT)M4z)<0ClC zMXEa8815mbMBi;wK2g+%4|v_KEwp5nW@SBSJTBbBG8)sYZr2A7s-~&ols|sVdB)fC zx%Fa#q^NC}5E)dUnxZK?uYVvui7<31GSrCB$W?nLjO;+1N|t z)@#LUWG5d#v!ZFNUo@V(#?0!#&pU$hS7`(_d=+1 z@d+EUhotURJHg&l;bm!moQn+^Xl@sOnz%K7(6a$yIZi~HNn=JycMa63c;;rd0r#O_ zbSuBGmJl15$FMznpS$$Yu+adu(c+B{;Fp5DGmqzdI-Ej=C|_A|Gcn0w`lXe3aDsX2 zyKY>o2z#<<7S%AYLdr0>MTD^BL5Wm%oIj(;;JI^q&IkFvf!16R^7M(gBDFG| z^O`oGbv-bc+G7-Ty&~iFJ&K_{8_Y2~x^|&H)wARP$-d;Tey;zf{ee;QK6&< ze$5$(3?~~E6gDuZXe67gXE{oEmsOZ;?~h3lDAAh{H5MHi=FS{CrU#59F24yB|5f;^ zs?6Y5rQt9kvrm*c+(aae2E9~!?lTt=Fs+T6lvJ5Dk(pS)o3^nzIwGj}ES7mPu*{{- zfg?}f+q+d>uoZf7x&)CGx>}D!S%UM4_6BrcMo9d?)@{#NQPG>1QTY(@eOuyz&?9$6u&uYv-?xnnDsgz}io| z-DXth-}3o?)j7#lQBj`W1A=!e&`EJL;;@zC1j`irHm1n+sS}0a>!*+=pBZ0J0mG}! 
zI-^V+v~&mJjE%z&bl$JWqZImH3c6s+H<*Wa>O1rP;4v!)dd)J2)Sx5$WF^P&0Aa8T z4820IylyC4E1(6oSeLDW)Ldy0r8n{G92Sq%#+T$}KYzYt7|XJ!6ve3&UBnFh@Bu2$ zs|LfzmM2L@&Wp_IXLk0^Bt2QjUCZO(s8P#`lkIHKF55G@t2K2I12z{0KTxN4x%oI( zYWZyu9nF0H%EocGs0Ox>)~HEchHAN;9j!k|HS|=42X;2fUtcvB{FIhdp#ZD<6zHwE z{(lDge_yzRhC|5H6yq_)?$$FAZ2onyNKVS$8X%O}LA7?7DiqRf^gYT{Q&UUdI25mi zjv>jrT~5J(mX$KHxl0(}TDZSgVLY=DpT-202zEaTkNh2~zP4{$u?C#dgiJfsMt5c( z0z~nmGDIXQ!YeN0pPHA@E&yRD%rqH&Y6xz8{d%=~<3@?z{H-uyVT3(&+W z*2rxCCW_wMOd^=6n%1G^OR@ThS3^5y7g|6+P+xgPcjPa;(fgj^fHZFSgyo*aSBV|; zKJ^e*P7qgAihztgo3Inf>9FIR#9>x|_(_a64Gfh~&=;3pt};b!qz4}EvJ{s_4c^KS zNJ#oR0vP7sRh|BtHZKP@Z6WO)=3na#Z1tz*f4ltDl%XQ|k7cJ|{ZGqI8tDjlCy)vQ z;X3GpJ;vV<)}O6T7&!j60rtU9F$x6*0|bx2=v@yCQ`&#>H62$Bn7gOATSmSB3`@ZK zT3%SdhYgRI#mQ&M4K$U#=$%7CXz4oXj6!E)VzO3O$b zh!5`#o}{8>Pw0LISQ^o%r3rBiX8LPs&hUMF@B}^9NR~>6Jr3;3@~?pj2C2tddzk!S zLPU9f1T-jPr2KLMV9m`9WeiS_Vtzg{=6wQhtM^F7@ceVcjLH1fs0y2fujq3HlR}d} zsh+Al*z$^r-hFK_SLQ@D^Q?7!2HkYtgI(a+ulKA>pWbH~jkbhvf8t z`ftmeb(*P@Tx(^rhd>LL8>_)2|Mwy)HlUaNhn&N z+oN~>-^!>#&VL785anAO_`D|zxr6#$77^}IkK3{MX2Yj4VFLwu#656giM|gK?-j*H z1dJU^FUNi+YRm?v8c<@(Yt|f8@;6}5Ku|LPvB9UUG%xWZP>$=x&6lg zYoa)L8^V-A{c8QSm&_7?C)au5BvIop=ij6}tMRSTCVU)*G#z-(WZnY^6A!#@`@?4Y z5seQYLkmhn@HgO0VWKYgOcNY^nmdD<)PN4Q6^uCdav!JY^^o8p$ehe(i|D@OM zqPw5V;x?@5z*%+>vEd&%W>u19eS5(#CWqwuB!A#+y^nwYe+Zue;PqF$CIlhgFnPNI zGdBP_-g4agEV6Q9-tDmIV&Tw}YyV1+3@8MoQYPkZc&=|t4p3)!zkdC?G%D)T)+EC$ ze4c)_-4;Z%;zHI%de#UK#8#oR{xAdVNu@r;i=IsRM;zy&tOJ7CxrWR1k1_Eu;I21M z0Oyoqq_WX^*-$rc8zR9?$TONE$_Je{3iYBbuAvNz)gF3mbIM3--xi}-FtY}ExFY$|w?N5u(1Tv7$! zG)zxS80y}_ruPV1t^zD(%f$Vj)8-G!5!NK5Z-dr(3VY?xJ; ztSD_q>9FMk@X(&~lnIt2$zS6^C#0_`2SVI0q$xS=x()d*=V$R0L|G?uYqSlAQZ(U9 z?h`b1FjhzhK7ai9k#|@vSp>PMJJ+h1q(aVpdHymoiFWK^xkv82s^ zbCR6fmL<`IoF_j0gp)VpE`AS7fMO?ZqyelsLkXn{{JQ+i?x`%!8n0&08oy6f>WY{*{P!f{@-=^?vt^15^ah9wnx z1`)vjjYyur;I^-z7k7~iVJpXi?Eo_Xe#3LLegAv`#s&s*U;6{<5~Bd1%kc>po*e@i z@PEdI|Bvs_gYKJ-$N5LiTI-Bw>Kp2LbKP(1B*3DP+o z#T$7MVwfN;RQl5NwIEt7=PkL52NDjPECA#Om!{NLJTCtsj zsOe!up9FY>9F%k&;DUtwARGSO-~XXX;@krx!>NYK zzy5THZ(sNET-RbEPob%SM~mzq$Pmz*BH9vi%P`Nb(LR#MMy3*k2FI;pBi2lB8y)oZ#V2yW5j0#qPv?dNU5G5tuC{9M*3$IJn%Ph_K?9X!<3o7Qm)(YBd&$8>kYmiGwT ziD?Adz{1FJABs$;soUD}6kfh73*H)lKPmy2P5DN~o&B*8A4XT?Y-f zu31kAIRzC`6x&4MUzz`ul5VFy5Jl9|)~3@}1EPf4lw~;u!-;adp^kIk`0KsC;vp~zwH6w z^qy)h>~HQA?*Nc${YQz`eg$ z?>^TD(8pCoc@)5h%Dk_F9!y4-7m6r<;;NvbNbRE`b$0OQEz$QeqJI6&?JAxn$3uwT z?zB}@P0T{I5_KN1vpb4Cdjd{Sfc~PMN<{L^I(|27H6yXaP$ISu;j1fgTXB^#ai6iw z+7KqXxoQBQuUpgO_2_$lP&2%x1>!L17IiyxBbaG7V<{hC5d(ev2^+w3mV)udwes0^ zx^VMamkz~Zk%xW#CebT?-kh*Zcl3GMac|zr%XQNzgaF(0y5V)m3H=-qo>N>*SVRWx zpiXAJ5E;ZaTbpxy5BxYe&Bg0R#8<76`Z|cAoc#EY06o+Tc(2tIkHc9P?9$6cORrhI zfP&;=bzW#_sEUfp)ZE;&^_yQM&9EjXY)L(;h<_pGW;EnW`|>`h&27x^%vrmMk2i$(A@vgHR~A+*R2o`C2wK;^N* z%=jOvGX$mb!Gd1|6r7z!_F4nzj*hJTbF*%4=tGbSihderibCos+lJl%mSjDZ=AZn? 
zX1sbQ(+WmohuBn_5#?tT&Uz4BUA;scqYq`1oMukqRr#Qd&mL$AOvoGz=XE*?^~a&j*$I;Hapx~ zfI=%P%_d#%0&*oqkkmC0;28DnZ$qABmt69;jX)tY5iT4hq!dx-|41}34icBIftDd4 z59p-6`x{@2ko%-YiJhOHJ1#f?o*Eki<0qYC6v1i0yN-3Ve2Z;}Cm0*|a(123U~f!% zxZFw0!-qXaY)r{Cf=wU()mEMOVNenAS=Sm;dn`QWMOZ{>Wo_-qK7#_Ni;0bRj)oY= z%8WmAqw9>vRv8|x(O`v6Ovjj%*Y8V{2)8-WhlU+A_%ac^pjedXuU{20zi&yS&@S7P zYkugDTcYxYQL*a(QfV_tg$!tSmsEEeat z<>tpcpG(V%f04KHSOB|LYCF^nG8=U(G~T_CnondU2?aaSA-%Aruvnj~GBX_@rt{=2 zIMnY5;=z{C!$;+d{^%ssy=JRS%^>VGCPzzVtGE<5vbb}j^w+K{S42*fS2dOk z22f^~S(Re<@oK9jvj8L-pblc~T6cezD^SIEP}_k6+i-1IkCNU4?9P5WR8lqn>qKJ^ zA-eVhulC3f8}iv{!9jHov5eJYTl!*y)r)6tKXDkquEyr}fN1s!2sJhVVf%lkb!k+d zvKk!VzUEfWH2Aw9ZDKu)IJ}i|1#rO(FIQCm&M;0;&ivn)T3A#$wcIXT0EUD)8@c7> zW6Uk@ydLf#IpAXmxRmmp49|}oJw)R7$6JZ)bZCb=WjvISj267pm z##2BdKI+;Hn2QQTPJBt<$jL_ODM}3o*~e3+#A2FAccx9Z{zH3$P4DxzxJdRB94Zo8 zrReiThddEykq@89+UbdDQRY~~^&2YfS6P_Z0@+gZkvjG+uooU@35$u%_*BZ0tSBI}O{( z{q-{6D8=ur-&+sFEmS)l3Ym=@j*S7-PP^qzX%jGT^y0}t2r#Y?m<1-=rE>VU+|RXe zG>Ttik`(?LFRyUBkc_FGB=+o(WxHLdT;qp4kLhn}S++^`%%&d1BTHTzjW_kGq-hhby-Gi75pmx zz!(1w0nk;Wpb(w`E%5df;Myypkr;|P$yKQ^8dp*_%CMg2Ixlch-Y&bjGgd?=w@2gO zzY9Xr2~&edalZ=dgzw;udhXCGCwC*~SV;Ube%l+J_H8vD>MKcATbm4KQEs2N7%P~I zfk))gOmNSER{{yZ%YH%mS)q>B>Klx`p4k2KG>)zO%N`_GteABeV=d*`3W>}eLqUU7 zSC;^~huhNPXz}PG18#JfYxX<^|Cl8+UR_NUnT<{dkSYTiVH)NxI%}gIFvZ(tN**vo zF*@BG9B{_0*i|##dbm9tc9z`2Eg6k;0rD zy8Uu&k3S;SpGT=L7ZV)IbAi0`HIw2KcLO-MlmKw#KRCE>Cjgn~e>u1S)St2YvqzaD z_eK2>>UkO!hog7c@BS_ocnb>P_KLq8O)mef;s=92iThQ`dT4IB7rTU-+^e6LNk05E)A7(c4J- zBshH5xHXbvB^D(4H{<)IX2o>5x}`DaJZgeF@V~ma0)Gm9#kI=Y%7Okn9P(i8+%U z{Ceswnbci@Q<(6WeYA_DMZ6T`2Us6r{MN>e=!=k1Tc;%IsuEl4q=0?(^@DOj3ykM*~bvD06n)9 z07BmC3*aEEj7r@w{SRjVe|^UGpI!j}I1fSpb{_uGTi!Pa+-_P>waDDj&!WK1Rrari z#Q_%98uOU)$ZE-IGttr#g$l(e3Dc`uWMeUwC| zm0q}jg!@1buPnrR`8*C=7!Hh`LJFt6zWoZ%^6ErNBk{`Jsbttw;EVH>`|Y8)--c_H|Hzp%o;h2==;E~<6Zx;#qcwU?%gT@TVRuZ$1{Kt zSLr^j3;MYEa0LxH`p>q)5$~OJ;#q-aoU=QjNiM}`Cl*Y&B?;iL}r!zSP zD$Vi<#S%m-QG23wyr_?rq@|P0=5T}o#HF~OW7UG}i*oYr@uH@s3K#-8duV*8)975b z;CZ|M-*#E&7mqEzrI5i3AR94n^F^u##(GztnxuJDm%Y&eXcbr;Ev9zTWf9|y(nRjE#??`P2e$}6-UAX>Ury@Vz6*cF;8e|(H% z9ZC+i{)yn@v)&}fF}wwAfy?H*FC7(H6mzYLa0jxWW%xH3wM!VtV?Znqi50Rao5YE^{ez(3pI$&(X8B7v&}>xhKC@#d!8ooo*UKsbLI>64i~LX6s18F zo-@o95KFCC?TRD#hJ*TV`zc&+N*SYjUKi8cBo9?@uk7Inl5u}dT7+%xovOCnL|28( z*^zH>#MvHq+3mAkHZQTX+-W6&T^d6P71ERD@V&9mxrTv?PEFbySFM3Th>2Ap080;p z!SAOgWJc_6zzr)m(sZWkln)iuRJE^?;;6kfZtCyX8K5&K8>6=$#KiJRwp_RPSgU}P za)HyJ+mMNsDRu-IT$VKM@(tz=3$vKJ1lpdJaRgJ=SIH$8ca=}w$r0rlX0L?L{tg~o zD4mgF{7sJ(4=WUVz@>V)y@BAV4fJJy`H~^~gIzo&#KqPOvV@8ph)uEC-5>qR+tg~h zT-#FcPK}+AMVM9qO7`jUA;CY)#A*DSPIB9NZzPSQ=%T4)R(ISa;?Ri7P?J`n^C8_fwkEqk?b{5fN3o$PktlAhUx>)oc6qJTWMT2e zL7Jn!o==Os<19q%NM2UyMfKC8ox0NA-)mwy2p*8y8}kweC>6)m*qs+eHSriCktY#j3GCi!;1qKVU>#wM)ZigcwNKac(`Obfs+c>oJ9NykSE7KUw4=60%F3F-6E|fuCv>#WcL&qQK$$Z|(dKO9T|O77mYr-ch^n_kRf8b!g!81e%Nitv=KMQC@Wx=-wwpO*fVu z_Xti?41kAVksXv@bKkpM+*`4nZ8gkK_>obF@7$A~-afcrrr%v3^sHT=cA%$UNQ0`3 zQBcxZB0)P}vEZH?MKGp81RffPovJ;&!rbo9A;~gJe}8Z;`Dx2qyL1!7IT-#`1p>0^ ze$g6yo(WKFP6KoYOZ3na_`HKFLo@-7;@Tz1An>HW4{uKw4$rq5D8lCi6bK){`ad7S zR{sF3z2arhN)$2u_UY}N+=6R#E+A&_IMuiQ{kW;&h1qjw6_%^h;rX-ZmfvM+e%{!s z5cXL*&1>O-!4I@k3_K;NV77}aCWDd|DYS`&V=#aFd^$UIYjMJC=5RZXnEcXk?sBV^ zg{7I&8$L0k6ch1q{XIX*+S0veFT|fns%DS+xw)Bz1L2DoFY6QihdSiUk+bIp zbLUtV18527saY9Inwq;GvS$Ol+)s?mEcla2;9_8Eu=;$Zo{-X(UY3?UEudWEqN1Qv z@EQ{}qER+V4%H(}?VCTuoaXCS<(GKZ+?lK{&i0$!OSr8-G3q%dvkG;jmXTOpC@EdI zKHjZ89%hOY?V4&TK05hi#nDk@2Q}Fo?amwdhM!IdJum6@c!Ig!8YQ}(Z6?j_skW$p zVr@POH2!%o(x9l1BY0k+c;=7d3Mdfv0JnZHxsr&8U+*!(4w8psuGG09i&2w8`lIVoAS*3X5(!~NzWZU1iZJZ<$HhO*cDH)A5Fe$bGd{4g}?cb<|~T=aVAP_fOlx^`62Mp>*E|ixn 
zve|Vs3ZtHe==m-?l_aZS5bf2&;NF0er4$#$_&=^rBP$ae82!u(7pLrr@9+fgw-cxL zmX1K@8o9`M!YwljN}FWxDJCWY%45Lzhfddl7&# z8GfN8IZ)t1PmAGvU7pqZHS;R(8yTwgyPO)5#f#5d$wfKqjDr^_$`lmY-#T&xzS6u9 zqcDpJ3v;u7dw<;#o3bIA>`zhEppYa?ZfZudo+v|}?s;1X7Ho>$gxhfiex9O{@Yf3) z*-ZopaVw-gPc_@h^w|A8w{U&1v5xnhjHmUvN=$l~cKj5J%W5!RgfIGV2aC&&d30?x zjpV{I%kIqOJNwKT7uxzT_Ty~~nkAb-CRNMQtrX-ufxP|0GZ2T=;WMsz=ls)~AY@=x zMtRt6?3Z@MQtg@{8psQ*pfGm3q7rESzPX93J)BVOuF|{pp;!B(tSl>;Lk2gqbtP|C zciE12

~fj`K@p6tU_t$ONg>Gc4HhEms?Ow<2or!y;aa&2H^(MH*H8n}bPC?Pn>b zslz~Q^cN#$`Mc&Pg6zXxp69mn6Pjje)pB2b`fBukp2#FQL!GlkBjg#LU6T=>;K%TM zd1R*_O^;In7o*{g(&XtMCL<>?-Yw=ilSt1@(H|=r7^K*G%J?-0sHm|E`@-q!^mTMJ zDM-JvS=KAESSbIP>T&To{Wv}4ok<{X`j-8h-2_oB_b$rA{i%@|y{h)S%1UMUuOH4u zaO7zfM1`>5=w^4RxNUDt@0CfU)dY!Yy=5{PiEq`MC__v|l&>(Bt=6<`>VK(Ym#^(U zdd+ETSofpC5O5&ece}FwH90xSmsOOPHxlJ7Ms~4{MRzv*c819)qNoaKb7wdQP=UDL>TGzVw(#T`$;eQx4J9pD-g$GUwgNk?I8;h;^@Ctnvp}x5JM?q@3 zO2C}kuXq&VSKATzj*@6Udhs)3@*j<7cnv6Kwcf?d6!&IMG+`;1q#v(H8D!+h6q4tm z$@@35#3v4xgxjN4J(6kPbek~hiRyz&mADd+up-4*?OaCg6;feE9+yG`k1Z`$9poP_ z`(ufRj+fGE#jobg_VLgvd(hH@6^`p79dAI9){2&ucM^hoDK?7JSPIoJUc6L`I9$8@oVK(q>h6w`R?g!}i)L5L$FTSz zn&^=hTYl;0VtB7=dWu39zNFN;wrOhteZA;5P!h!9_G2tOqyIoFHS?{*vU6%F`pFN| z+k8hmu!r`^`qwFa5j9Vm62ty!p?-o4IQfmnej?;^ zf%+i8W3tNi{3iOfw?^%eyJqdtr-qe!CiayHzzI4Cg)53|Rw4#kQ-UzT*QnWWlV!{B z7{k9bp24Y6M3vLH!t(HxkkM^Vz24t~#HQ|Ab6 zDb?mA(3FU|eqK$z_{#{H%XDlM0sBr%8Uf-qkMJBm51kb}eo%M&vf=LQ^{P32@Gn+* za=Jb6B7N^g)50q}-EIYUQDU*plWhKzm6L2YFiYj+^7>pI%>)Hm#}sOiokD7+jA-?g zehu$h`$2DXI2Q**jAS;qIg2wrOY1plLX>eobGw?0{Jes8FMite=Wj0sEL-lGrI&%S zwm*{evbNJbog4^A+KT~(BL~d)9BakN0-yI(v_7+5&^7ZtZ&>(=h$U(XU*JIk79Xko z%Q@=XWq*rEiJ`J#E6AN((NZ#Umdr5LCu9fhRr54^v09u>YZDqK6-f}F%=ktCT#Nl0 zIjyvguN{7+Xt8P?T>X*dwA(cNRTx9?IbO$wXPS%W!yIo|19-cpLx|H>ImFS0xY$;a zeC+gjY!Z}W+kskPtOQ){&?bKifo9XSQ++KlIp;wWXolS0(7{cF@m~z+vX(KVX<3#u zmCMbWM1NFP`CQkTw71w&KfU=o(coiNLCkR2*BwRT2Ml(2{*@-68JG7Xe;P8}grKGp z3hQ4-9s|e`3>8f60p2X8VZLYY@hy5qwS47S>}d$p(E*Q~ z`{@}niaIIyu=kgEL`^JWtY7Lrto4O6Kr4_nIF>aJbztbgY+1$Z;6!~E8>tRInP@ve z0c1WmMON)L>^YY!x4k#Z;rMWEXT;P<%YaKzs{f7G45(D@NKQ!4tC`a<4NR8l*S%Wu zk;H?n^8n%WrQ1rEz%PDL(78E~fwM74fgGx|nNI!ym7*F2z%bG$KD#{q;8U7J3PBnm zDpvhCFfcGt-1jS`V)zfSc0wt$>GZ{6N!9d2BWy?6=~uyP^Y%^~PKzB{)bz^6T(jvY zL)HXPtzdJdW|FAg8|;~xiPb$L!;M}nxR{wVQ43u99(E+FL741Qf0>zdT%-B!)H7fO z6NI3_={~_hIRZqzP1l=A$keqs;{S@Ha_Wey+{~=rpK$Q3LY~Goj-Lccoa#Q_Okbp4 zHZttbdz1ai0Tt{oY-tQUJ|u9M%2#A$s7ud>jij4|G>UT8-WQB+{8;8{zo5ud^7<3J zcsW4krJyBRaP5BP`tMkS6kE@?>4o%1uJej!5VU{3-Lkx`S%E~&Y^|3MB zgV?&_;#r_xMykHq4oTzO9gYiV}t1;atvB9(%(l`lc1I84v~ z)KC~2r(qwzD_tru`@ws_7@Z4)iC|9MU09Nw7U{zB7*sA%POAA{s?9Ae7%>ySwY0Rf z@mpH&LKTT=>uC4)+4v;;K-r+jt#v4qO@uvE)gF20Ymo9x_L3rfSed1yj&6uVO3CgQ zm^r@Iojre8@aDK*_)723Gy4}Tt0a55Rr{p_Zu!=)Sy=3Ay}No9cGA$kA)ry*9qS4E z&LD@QtaTjR;G$8GYHDDiXLWql)R{Zi_fBjxY{d1M3_Q0oOY9Z^WPxl? 
zM?;@2wtZiU!F2TXh>Z;iQhey`$Pfu-X$H*GOoH^p+%d`z7wuY8-7Xw%+pxVDpan_E z+U3vPBgY=XoV4@~J*QtdZq5?}4F?76%Vg4v%O9(=18&+OS7>k~Id6G9^}JGc1;UGq zOX)XOyvO8=YjG(j&_&E!va@^9(9qX;?n?)vD>|@9+j3;9h13bqcVB9ckBnr#%^S{P zfu6TJRIX-)b#MGu0kRMVd83csJlZbA!fC7xa>*k(X~0tm3uYt3ydN>G-1G zcE+~&8%w}8GOqKOm|(Bt*s{&@OT@~F`X72%vWoZ0h7oRewOthCf{e0)49&z=d5*%u zQaI3*oU!{;Ph;rB@laJW>9&&@oI+GuQ_M;^`tdZs!RlO1kZFt#WB53K*HOowQ=vMB zy&l}yaMtJuTaVOA%qXtlq56I^God{OO_?hPJk_-uW?!_F30LdUd?35Z`EFy^WePhU(k_Se?mohs`1GsF%;7 zMxk1z5d5)!s+YnciZk5NvgIx9f}UEo>i|z}l&!Wj$z=Nbbh+nDZ#tFqI>i%5AJ<9u z$z9SHvWh2}u*~&hVEkhue5&cC65qlz@AX@h&s}0VYNZ^4O*G5UZ>LR%$-P#Q+}1Er z)2k01Y-E!-7mM-eCzgQbq>)(c4MCCt50ux`yPY31z)U-)WM?IXouKL9O_(?bA(3a|cFRycFT$6uUv>|?-$ z6y&@CRsjie=8{sjp#`~$^#NOj7MD1XlHK6FkvwdEL0_s@m;$NCW>|| z)q7aet@r(L{)Q`gaj9wws6b%@=)o*5wY$n?Xq?{OxdsCg?a!a73jShoM|2Tad#Ncl z0|<2Tgv#h#>vHXEW?1Uh@orDk9`~S5*Pdsi68qYRuB!0{eofH;rrG8Ej}6VJB;rlXruZ5r8b>7WJv&1M62%yG$mpJO0Xw?qKo;z zS`D<3`QpjbjfWDleGZTu_p3CX9iCkO7+pn@mwJtkvYf|;9Ri~2e5Nm-$C(=D#E!tI zo?ySpxrEF1OSOd&K4%mcveq3|EoIArosqEahPWF7nj{j9R2LF9S-2kKA zt#t2qR1Ji#pJtT;Ub-fGb#Q#Wty@cv6=b(lUT3d*19!JzZ7tX7`;ml(HH|c(BwYP2 z?#^zfEOS+EEu##g&gI90`<4-Iffq!d0l)0{H7EMd)-i2IO&ae*+bo`0ZF5n*R^a-e znM(58a0G9(M*;J57!ljpc}Df&%oZZa_|GsWVgDLVi)lTp95604a(eInB5pOA*P&2) z_+tNdUT=G*ENkBTmEWIWLma%f+{4S<$JFqQW40Sds2CcD@eDKvL|MaAMGgDuivnBB zo-k9L`7t(kh;!NG<9^V7`f9W~g}_Uc4&kQDinEu% zY`>u1!{6%B=5y6e8d$CkJ}c`|274Cz&ho2%sQ3liscMd4;1Lr+$_Z+nPY=dC87HIVe~}Ky%%eT(C-h*`%;m&Mq1?WkO)7O==BPMKNXAIG*8ThD%kgl*|3TKj{RYoYE`Hs2|Ls%U2!mWV(61g0~} zTTz_$s8gU(l{E!L%t$;{9Ic0U7Li*qt*lCR!elZ(@yEK#{>~{D!(!q)gG~Hr{ckbD z&aqTi%W!+5=S1^iy_1+1f;mx7^j1Zm&azdXt$vAKOs z>f91zKjJL5Vo4G#l;EFX`kp5cZDh8A$DwbAl4RUBj_Km6BwbDOxlm!2>s!fuxkJ49 zFPY4UcSA#$k{A+oHmr^^MEKbI2>YP(e4%|1r*|kdA}qrZdj-S6wrA*GClk$G3NV(K(8t#`$5{Kd^@*ULc zpO`{gVuYPY;=x)tcdcuRKvSVQd5z=Vj9Jps;>O!#ZYc}jon1(XQ*eXK_U5clsZr)$ z6ek)+8*%t4DFcLa6<%Zq-Gb4=t;!tpOxz=C{ zGRHiZ%M!i0B=F$N-?Yx5dEK4VgCwt%8&<(Ykjyl!wl!lEpR%#~bLc&)ePe0v0__k{ zfq-*s7G~yt288NhzVb^hliAblm#ingASf!X%_vXeV?7??6eI99dk{%PRNMJL_z#;@ zEhXn7t)}P%&v*^0X^``xTuCA&-~9dwVxSE9Gm*XU_Xrd_lxle7S3WeyJ~Y#+s;azm za}1g^P?UT?;Y%0V(_}gj*F*(&s(ruA^MTKP2$&|IxMEw~JPbDYL2|gFliDITlUo~% zJ*UT{a4WNN@+SSkhpL@@4D%-#0ry2yzHJiyUM?dv$>_Gh9s{GzaI4s(*dqIPzTXVB zb0ooD7wXzFOmVkjOgVSA6~|}6I9B@d=6BN!b>>(eBwqYneWOm|`1WR=x)CP|kAYff zXN;+&xC-d$MF7fKb-qK#XCl`h=7A*mO8bgy;u<_3rmBADmYp3)1XhRKW z7`+9}8cO=k0&MgR5t2{a4PcnXKi<^HfFIwB)!6DkD;vRYUtS=FB`O+l35umnYSZxr zndZfCU!dxX_7J}hsF%ql^Tj6#g;ixl!L)J_h8-P#@H4>0Na{=eI$jh1CJ6WpM7{BD zXXfD?-$4tqGoJ-Puw$cX2g}I#<8zJKfe>w>|IJMHZnM*}Pm7K3Ee6 zl@iC7PY|WOq2@F!I|RAwEA~6gPd+Jlgp_X*sD;Mr&o#|!{NdsS>gP9`%5o*FDQY*| z=Ptb#jBnX%8+8#@G4$AGajjGhv2byt>$>~LG05K%E^jnZP@yLia#OGKT+XXk+jMYh zmaWRV=p?QTm8joVlXRxCU>pm=1+K$k5M>()lKA-P*{2G&;>w{8`NQd~8d8#LZJ7$w zuV%-#W8Agjj1Kp|AW6abtb^Q5!u7$Bz8HxV`ws*Rw~~S|goQ!WDK682h;{R}WcNWg z$Jra+>frHzG4|F`RlZx-@V(h|Bi$w4AYEHPx+GMP5(K2Xq&9+tv~+I)2|-G_kq!yz z?hXm*dT;%mbDr;g#(T#2{$Vio9%B>tb*(kmTyxDe98waO|1Hh1u2}DzUZ$@F<>d>e zQg(C)@W^F5s_fZe(6hQAQ%zw7PAvt8;-;CHP~s_I4=}zk0cf$?h_R2Pf|HoaP-KH3 z9P!MkJ#7TyFF2?OPzQz)LSzhGfUHCf9Ww0qirn)LSh2_#nu(!`FBouQsj5YES6G;< zON>qv?WW4cH?U-P+LaT?$+84XyW|!3;2s&DRi5{0pYuPrJY)|t#fCy=p(i;dAeZh1P}7IXZV43H|3>c3X=@#S`R zTER%IK#|=$wexyz^CqH0sz0eT5!i_qC55Ts%cCPUuoV2XOVhWI+k%*%GX%Ge;>KO$ zyW7v#cPDVIzqia7FA^5#V?Gb$5D7TaV`tfmujszB{E@zbO!bl}R7MW;@eER*qCBEg z&5_JiAHi)6H_Mq4U3nm(`y)Y;e(Z5*VC(C_^uj?7g6#adFQt{QD243Zz7r%C$vzAW zjf@o19Pdd;Abi}LrpYA3IWGG1rhac5^n|Rtp@V0qzj3h==C1QBF}#gB*$>`jI?%~1 zb3kTy8BE>?@&fre*~UQh&_}Bn7|R&hk}K`kck>8sLKHb-zt?t%QlLdO9V+UE}qx#Pd)F>15I}`MpcS~rT1n3MO((@BU4uMLo 
zAzV3$N&@e(yRt@VySc`?(F~Cf5^Axn;XW)5_&_(7rNd>jKx&!EzDk0b-Y0vWQ8JzbNwp(HH9PkQ^ZXiG zv}!yx83P%ZrtsFI3ed0#)CicrKXuHNI^J9I@}AHeoj*yEpudnY!3v(DZDC{K%}qy- z4)gTvV3cyq&1fqk%;q}QpbVnh8Bs`@(bj0%v$cE~V##?NY!}14!o+nll(s*EJHxhz z^NC^MWf$%coc{2L*oy8f|x2=`xmW%L~;o`KA6?L4x@;`&o#a<j7CKdA#ggV`(|j#l2R*owua2iP)WCR=YO^URgT(~ zn7zqkdF4Dv1_WZ`i#`aTdkj}%U=6*hijnJ&SM+)^^RY^wI&1uDul#IMv}L`j%0}FT*iq8`jJ-0JwXTT2IYMxn@L>=AvNsfsNx zU0%e>vg(hbk+jULG|i{aGU4VN5X=yN*4J_PYy5EX&FA8!xPd+1H8}1n6Q0J0iq-z` z`m#6;NWrJD0Y^9FY43YbjNJPtEu}>%dS^BlA$@_fBQOh)jI>${HNv6HRC~BW;Ig8o zjZf0t;;#u&>Cup2x}_=XpNAY`RLEM_S|u{bMd-ZNdPK2b8+Ss(zD1-FWWcvfn>>55 zPsFnVH>fnTqbvK0t-8aOnBoRkuz|=l=GHiS%8>&VBlDBO59Sj z`}mkF>w7$^aR$__JX2GGa*gNwdHkJjBxy9H_6R3JE|_x75A+Oxe#Kbmve)r zJ{DPAX0gs`t_7w8d?XG{nJE=~KRf7OiPj%p;FoR=C8X~1u!#=t-TUJZ0#!67 zcp27YjJ8-8CM2aS-^GK5(WPEn8KO@Wtb2Z; zBZaHx%PYVpyiqBR>vMwVX#G6OZeL>7hVSDEu6v+77J64oih#DDp{x5fuA-MLn%x4d znUqN2QpQ1EJGP8f^oug%x9l@0G=wdsiJTc$Gss4zu51QK-NHGdE5Vzo6vwG_OFh*# z3>oM68m{8m{WL9%gb?mw~J+IYPAKVy1Vw`Hj!HK zqR5u`2qUHZb^!A53C^_8gG`3K-WRz>ww7wr#r>zMUoo2XCQtNoq(L42 zs}jo&3bWWr2LJBEhIf-EZhIVVxtS??Y|K9xS^_Vd26Yl#h1X7denC+X156ZW{@=%c z2|maeB8Fpn*IK>*8#3qWkkm7tR2$ZgKD*3u-=C@KdD1vv_yUwU8~1|K)5P5sZJC9D z4(kVg4?(ez!$>I>p-fDZN9|2B5(%v=J|tYJ zg#eNLTQBW4bq#l^!$LZ#syrY*uay!*4dpu_^^RfQY9VUI^Yn``T8GAQ2&RT}j=&D+ zh`c3X^C)Avayu(@kl8)=Ex~U-2$Et{An(_$V%zWMuX2?3MRBQB$n&5>;q`}U#t>Ex zG1TitFehC8I=7TMA(t4xwQ()$x*D2j(S_*3h&54kAR4SoWBJ580r%(kcZ=%|cN-2e zhCWOEuK;H{Kxa0`CHJ!QUXB6?b{vl9#h!n{=g zT=-rbXZLkIcV76HwhAij^*cR#5=RB3vi3GZT#9ep4Mw_<$0P1!Ru z{09t(&M~;EncVyrC+xjnJ%dVu6whDrrgqEWh`v$7@xmU9DMQMTo1*RGeq=9z+TH2W z+IIBtjbd3C#j$|knh^TQFjHU&wN4g={sB9ih2c)m^^2W0Z(=b6P1#IFu`h@3t$#Qf zH2V-ET1WiY4n@k;ST_oJBsoDDvL3c|jj3i6LEw&Ko05v$ty`XS%w{A4Pn-Cd4Bg== z-*??~&jqgqkrxXiQSo795|(c{WsoAocTxjR)D^!x7^H2CHldcNKCaNmYKBK~{n%$F zAr}scETDhP_zo4BMT{UMg7CwuIZem(In?O1B`UXvMB0JPtY@<3j}`Kqi@JQJ1ywsG ztQ8}@<`S_qh+$of$Drql0#qJQG){X&X2xfVNK_;(D{=FZuHPw}dpZO)=zt)^Ny?HE{6kR25NPf?E!k8rs>h zp03_UG#>Xk=C-Pk=6N=FbV{^R2slbj8;z);ms^w+azm%_l^G3WFQPUR#axWc%_U(` z-09bkI>LvxIy^F2x>#o=yGTkMj2eR6(CZxy`%$`;1-ol4ynAXtr{RqfJy2)7FoOwJ zlIp=ZFKU!@L-$8xKWsM9Tz819-Hr72e@K)2c3&JvxC$p6YH*uBqnp4#am;TpaK}n&&*b*Hg?qNipo7BAR9~RFeTn!5B zecx&2lA*TiZ6Uo`Ccs>kYj5jNNWYq}&tCK6f;p(mJ;pBmE8i@=(PFVje`0j(ft_&+ zktedxv(n7yNu*p0;)EhaHHY`_Z-wZ#ASI^l_MLnTPTu-tmIw-j{`8?z9m^o&*E{wa~C zt1<*|OY!>31z4j(3Oc_H;H1}yqpL0O)et!tc;$4Y;Vi&kRPU7ecB`d|!Khz;WY=B6N10ANu z0K6Zh;Hj~pYUE`*9OeV*@nH2wyBPXv0na&k8ZQ!UZ?_QX?b`7K1S4Bx{f`y)5&{8RM-%PY+xh1;f_9WlAv2_dI3BQozlm@e1Gl%e#^t`+7HG`0ruDQ&X zF7?^UO{aSd>hbpN6?9mDRkfO}92*S!>)j8{<;|{DtC!j7= z8k6eu-x4E+$S=3=&$YcyehA}ueuN5VpiOXpblXjYOuaCtS})$6*cOo)f8T$}(@$Mp zL)zz6?p87R>DkR}6qYyTo!gU#G3i|7`iNp1+$ zcecF-hMwb1|GS4*=Nt4m`cVOA#kY<{L;lz5{d$piS)06~1{f~7tlkHvINW1L1`<_+LS+6;t3Dx@ANalSM_gX>nOQVWS zx;vqsZaB}K1|Zyd{ocPvyw_7orJ_I|P^QlIeIdYh{6X+NxAmVbk9b+ts9e;uv; zY~@gRTwZaf6sotsNVNY;LIJOvjd466WItBck+ev_(Td;IODFUC%EeJSJ?ie*6{24E;_Pf2xMnjU?;@fE}YVXXZf&J=bV|$3ED?=RyEJN`2 z#U3FYaM}@ky;hh{BhOyO#_zTxrz8Jn&>o7kyViW+kmZl&w*g*Rx33y)_c(LGvzsO_ zF{-4zMVn`VvBG62BBpOQ0L>P#UZ+OVRIf+2e#2u%+j2X-E$K*p?Xn~^FLmt@x9hx8 z5kLqjK(8epU#34UI?nOGs=7Jwn2msr-~gCxE1BMRN_rT;7X%&Y?Qdjp@~pW`L;ofB zK;P;~@-Jq`rL!f0Cv&do)zYR{Y4eZ*6klM>8qn-5XrJg@Kx^=;5!+ioSeswS8?ARw zd3k##=3e*DZ{Qc)PC|1g;k44YhYXx%0f^+(s5l%L=%3|$x$|6#>q%U9`!*K0dtO%N zb!V?%jnwJ=WT6x{?GE4RqW+M)DpB=9HA_xq=aM^^)rR0F=uH_?$N^x^Yq#r(nX~ z=RJfJ$YTe)n6d!cxpT;f3)#`kJ=*v)%jWB(>mk}7{wE@g08CIxj~rl~vYPzPWxmoO z5l3hUVa0-HX9(DNu38CT$Gm$siU})(jyW1%3IUxgo9Nq4*}8tVsu5ut`aIqq+2fKx z!DwfhV7~6#Y$fBAtgO!f5rnk^y9F=hSpYz!jVZ?le8$xboHGgfNkD`NnE!g_>)Rx8 
zbJ*^A>{2S(e|r08q}JyoZPAr$66s&Xkgf?QJ6H~_`={GXmY@KK3YT~1Ej<3$HU7KC zuB*R5-!>z^tI@253yC-Wa|FPtB?O3X5^fyAsPgZ!)2q33r*2z-PP$zUoCfZj-MeuD zJBv$zpeDdR3}Jtfao0W8a`*CaVypAL+kOi)sczoVTQc7x-)(>0QP2&f zUl}5xI`uc(Nz&#(fk>{l=?5koicn^Qc7Lz@+hd+zMcB6gu46(MB%g3DOuZR<3E00Bs-;0=WJTNwU5hMoVF4O#8>tR17H*Zoky z?u@zp2|4tsBm7G+bc`3O*#)sKlsMMpMPt36au#R4PQG1)CarI3=9!#|+Ku{Ow(d8K zTZzJSq40Xoqc1lXS$8Ua=WqQ1Yc>dLZplMHkfv~;i++9WE3TgXEZ!X6y>H_@A95m^ z{jT~B_FVA&X6se)zJS}pK7>^ah5|+0>|&gGuy(JNzACdTAJ|U5nZ)}nWzXUWUN&my zn+wtDx)b!@UgysM%?Do8^(C^;)&JLwp8tfTxxT*6?{Q$Tc2s4neCxqXQCD)l zH_ZqtL4RjS7WhX?0w1K$WNu@r>f^b}f=a$=FVWMw8_bD-p!<_;_g%SlOGw3_qERpf zqppryh5Q%`$Sy*)V zle*hVj-=Cxj(kpX0Uw##y1MZW6?u7fgozZripQ#UxKVQBy-ISA%p0PDn$ zc+ctS>5T@_spjF7(@kjw3i1pvq5V_!9}T;B9w@NL+r@$=;wr@e1z_h4Aw1NnhQ^Ad33YjQZCJx z;J~O+fT~Fpcm)+=WeG5cXqrj8PkGE(0R<$}%{ahV^t}x_q~MjmRzp~qe8R{5%+Ba?O?&?L zR(B@n-jU3kE+~laM6dND|8_6}Kf$#vN5Ip4&_1dm&BfDvk-d80v-@)KBkk>$LX^3O zxJxmhsVIVqUP!9(&Q;~PWsU2qgAD5g*JHSHP~440N>=SdBqZj+<&QoO5Je5Mu7{Up z+}cY|o^&g^>R@kpd}_vxcS9Vwo6-KLzwAg>N4)_IzWR4V7b{|pUW_eC}} z=awcESt$9H$g{T>BQK-EMD3pBSp9s$jxOfw~H^aF(WE{JZM-oTDOUSJt zD!MIm;bvPo?~zAEjhLBDK`N!zbWH1@Low2~=BK)EV%mr3EcE5tMz zSK=*J*(BYL9Qp2y6*?cxiFWg66#fI`5SS6VDCt*~n3$Nnp0$Kl2P_NDQ_?rfcbW+y%xV)F7V-*fe+8sXa!+AeM*rlY868wB191M0cdG2tU248*W8Gxq_UY-~rU)+>!?XWU~WRsf=_i#Zo+9l#-9Bj$(mjR3?4pjK%ucLSQE_o!u2)f!O*{tS z$?J&$r3d8OPm*%dHiuCW6hPA0#3MStm4{rC({QI$B(U9<7Q1r)+(c#FSZKKAy2diV z<{GLQ_b)^xb?vf?x+(D^_WBb_FPiA7jJ$bOD> zNb00`ZGAl@o==vr)Xb$wc7#A~vSDmY=f=nemwg7*ey{!B9(|lNBJ)4DpN)4|imTqZ z)Ue?h=&W?W+6k)CI4=91H?O=9TKfB(RsZ#z3;yeKKL0PzIb4C4;!n|p=}>HC_fMWp zK!WabIv4GM&ODD|wCN(+*DrrlkGki6{#^IfPvN8+3Fv(Fo2E8p40tsCZ8JArPK+YY z%*M(}m`;7kq8i;nRvbb-K91YA0JI%Ym@W7 zeu6>0&wQhC!!s)w0u_#&v6Y_l4U?KLnTA!8ORp<>C^qFAb6;iQdo3Luy58O+)*7vq zlnk!lw~g{zircv&d9D;BRE()Kfz0?=aK?=zpcSkR5Xl+bF_!_~Y^ z!rLGH+THfnz7krHbN{qr*y~=V$bmI+h30Ba3^rpoiwEo#$EEK*8nquv*F%osBUy(m z63doTGCHR@%Ixh34Qi+Xei$L0^frmR!D)iT%ahI3Pi0*%a}|}9KZ0JqOCGKxT;|ZsPk@eQ zFA*G*9S_qhIv>2(%HV_F{azPAiGr9r#ZU-C(mY@nkP<}6{3i8(z2nfU`P=&lT|V%n z4oP!4Qt+$V-#pu(wWKZ;R(_1#FxfiGe@*2@O`FE4Adj?!_x1c!IkP-MjBKDW$X!P zv<93`N2|G?nxBA;aUSpUj7NhCr;b>!K1hg3PYV)cu0;1KwsNzvg=vY~uCqiU6%2*U#>c6@9gIby-NmRo2Dm%0Bc_q(qt-G}}fYX^Py>qr$D@ zKR9E-85AE|@^*FfK&9>dyMEbYbgAUB0^`O{s>y?t(RCjrn-#sPp}$R{YGtv7+tC<$ zB?k*2E(ItnfC77~U>ieo^9-v7tr2xT-PKwrK0skxOE^X{jJNfT$MaH1k(ij+52vFk zTSQWcqpf8QBq8-tHXNpqr)`i6d2@as$t_rQ`Y|Pi{r&qoe)B$0Xs8!8+}g_SiFNC< zlUoW>VcbH9K%`1GsT<$ov7(pCmOi8aQu&mgQ!JVKalU#6e@l@+xGeg;=C>Zu+0N}U z$^SAbCUN}zzmZ=+d3I~YnbqwIX-mxCHOKtLMos#EJSJ(%91ZBk{=@=Z<$uM3s{a}b zjPr3d)iK$C_1q7F(M~x)e1p_77x{o6kN1#5_@9GuMn3BCH~h#4AbHH_t+~Fjv$LxJ zEuNO)k)yvs-CVD>hUB`AtHMbaDqvohV$fj3R@%~R=)Nbyki?yUL7l>x&$QdsiUcXH zw5zI$^o5?N-Uf}iDjiwyE?{4o=@2R%0S|vukkK(R>LqezaAjQ%_$n4|#(nHxKW+=T zUjJI-Zc6#CZShUq1~J?bi>L}{X=vfyGj1T$fX`DzM zW-?pVD&`q{I(KXK-X84h5N*cpF;-Wx~=oPh1nPdNjdj~-GfR1s8r)m6IK{H6zlJ8nMRGZrl3oq);Wlozh-%9%4T>{HEmP{>EFO^1wTv!pA4!7Cq_S%xRDP^BNh0OpGp`Dkih>1+X0+zm_sU%*=>m16BU_Y}2>BPPq?N zg@!fWzWtK>Iqxh0Dh1$xyAUVoIy;}Q-&}GyZw#0VY_+&IX7IXs0-$EoBDxJU8ywDHNAg}mG4qw=Q86k{ByvvqI~ zwmL*VAV^iwXg?>L!)_|TVx%_!h-R_XT|WreJ2fsGt^A^F-WzK^`$hfjko9^K-0K_Y z0~(7d_*CJ}W}egt2C&sHZRh?Q%6cjVG;EJDxJrPPyh#7E1qSkYA6kW2c1qt|Vwj&^CaipzBmfc{s^YBx8+Vd#p+Vk1n@hNMS!H_7bg)n{^l7!Trl zqf6&cPRx({Pq2`_}Vy<_QcAH_$LE#k18CskZt>8dirZitwbouI8fsEtyUZ>+<;Z)WGOZWLwRWg~r8%{9WIY^R*V2e8%;+ zdI{-LZUpvhlk3)2?>Y4(UQT>94*vZz$VYyb4<~acg0?w=@P)5HYxE1H;N8elojXnQqNfLT-$bRBb&T3-gTK(Jv=8EiHx9lXwpoCM~8dCxfX}1FTYuFOFNf zRO3MH9*x!BmUXxL#>xHu<66#6WSEC9Km_~X41X@}K^@y3GGuD3Jj}e_vzleZCSak8 zG?g-*)c13KI>fEIz4J?6J!BiFwC7;Hd#xiZT+8PuH6T39-h*#zb*6Uz 
zQ-&SJ7o3PTzv9#6eIO6{?$T97@=xBaL-Lpf^X>)Jntqja!rpwtB&b}uXs7M-Xg;Ct zt_T?TLwmG{ZP%L_-B7Fx7rJSfJS-p1MZa7xt?rS&X;-nJi!FKRe`c8#%VYZ%9ni_f zhch(3_9}FAvjy%w=$_~bXo(ZNv3h)!Rb6e5edD1*Vvt_p^7A2XhF5uc`E9O)jZM<= z2WuEaVLhqEXB39(D(m%oWc@g@<$`LdP$qX!1(M!6K1UzuM@h{`%gULqWxca}<*e4% zXA^-n_IsPqd}JjhlN!H9BRcF!hY90P-k)m0+@`Kr->48b?|vG+8fZC59*6Xa{XkX% zU~V4t=l3Mjy#eAZ9ea;QlQ5JMb47vULg2em7QDEWI7UZmR2zlBB7h&!|nPJatu!!mvm z%~G4MynG=ca$V-+Y@OzaT@b!+it^KkM#OVQWS3gh8F$GtZdy1mN}%w`NUPI*sw1{U z6iMZ$8$UBcmreD6*=2(iL7R_|-0UCaF5AoL44*NrB=Kk@nA}byKW1ifqM%GG?(4zh z9r&jBSR{j8{M(-kx3r*3^HtFgF)r6vx{AOeLr+MgsP|F_+(K+F*9uL&mOE=+?<%0J zQ^CH6hQ6r{UhnCOq=bk|WA*UhP0D<1nt?CaEftoR6HGpn{P{yDnKYyJh5S}nVA%Hd z_Jw3D<^>;0q35bYt0we|Eg7l3mDS^h#5Xf#%QHB*GvIc|qITV;iK2f8UF;&Hz>)mZ z-=iA+;XKV-o>YG}psy)NU&WeD+gY*$#5xQph+1NuG!-VvvD_Z7^NbtoVDz z&j9@DY0xdZoWet2c+ATT16~o80zI3EgnvUNI0ko*1mDcxpS}ygL5E4+-^3k{CDHH& zIyeD{bktf5TAk;Uscm)1Q>-vlCAhxU!|w`7Y`CGL$`k%3>DR4m?y=5WOL~`^{(euY z{e60JLqLE1!kf~?J0#TG@FnyFK4l0i6-`uh;F6o->Ex6gEC&bng2E(#lS;eHrPZzb ztRu@w)v~M7f|yvai;0QSs)Yf9DMzvL!$$gq3=b2g_tc?Hb{mr@xIEoKwNzLF?Pi49TwButBf$1@CB|)1oYS{oxdTtDl~^qb^W{4>L7I zjmO3S)8(nZFCDGFPlrgGxkWBTfz?4HTNgDCt{vy>hm+0H)YV($LfL34(eP>p-8^-U zAm}1vh5fu16urKxrJ7{fE1K6%kBw?yA5zXd+=S!E36bX*f9ZK<`f%ds`1okjUCDTd zmVrU~YfH;kUslY9-TPbN2j_l==WJAf&dZmSE9H91Dk>wm@ja)zlRv^+@%z!G2})8$ zoU)Z72hIluR0aGWeRe7;DG|Dsys%`e(Aol=ao$!`zF%}Ew2 zCQMjek?kotJ6OHMfG6?GixVlm-%L9`3#4IS$IG|O3g<7Cft?8cEWNdA9@Xxn zKebd;=bwE)hFN7U25ql4rl5%8G$2H_lQVG|7z-B$3N|yZiy?@QG1(E&`UzNxG~ylP zDDfE#dW!le^vO(?XKsl~s0Q*2lVBM2#IlBRUg~k1(ptpL8cR!JhB%&zm)BQ`-@{s$ z#?4h2j;#-|FFr?W0Uz|gL%vi$E-|R%CU%8?DM_*dC`h=IUl~{zz@M|cV-3>X*-4F& zFB;5k=^s!Q-2Tys5yEE#b{b02q&8Z0(#CahoR3V&>FUJbN?fR^yR@`hF>-z7(l@Uf zd>U5~PEmb5im@TIM%*qNswmjRA;}-UCxF9B@bCU3BaT5~k%@)cC(9L&+5p|+UlLkl z8Pdk@h}=4zmrdD#M3TbdVgVB^95|~w$MeC2Qi)qOJ#@gDfhu|qQlJ=T{oC|NSG4Rt z_fHRgeXfoJwJ%lNBemGD!GM``9c*Fnr@z^H!=!~pR`%tKQMH3KU4uKVP?YxECX4hr zT%yuXbf%r5_0jf=kG$KC)znh0B{L6djj#dD+@|GziqaP)NsO^yDHLuDY#zMnf16Dk6q%B{+PJY}C z8k?b6aasa1D!XXfsl*D0(#UnGsj1(aL;l!SL^5_#=dWX2)7bnY3jL!dTH@e+;j2&U zckwA8-BX%MgqVn^r`)W2z#+c10+9t$VeHZWr1EQq|CY+TSgxm2U!U~<>Ph4hUlXAz z=4^UqwV{z3&tfyd$?IzuEmi-(4390@;gl&ErR%pectEp^{;7(v?ZU*e7RR1zFU^n1 z-2dxZkjbEQ(E!A%oOi?DAO-he_ka@?unB>I9_Vx>_+fWw&w;d6b5m$DsT`QM$--;FeukrdP-TU)~HDYuRkpo zAVH5loM+=6J_AH9L5~HLsZ%wmm}+`5%i>MfCSRFjX-DTM3OmK_Rl~2TNkh^Nj}>A_ zO`Z`*3?}d0ttI^vO zK;L}vj#~ zL%!z2Ad#o)zVc=Zv&dZ}B^rPkWgK)6{40IFK5IF(o#U%VsP1+7-KNYtPCN>H1#IX5x-=QI|VYbpL4ir*yK>y5w zAl|ZpxtNCNm_`G{kqO`_oOtg-{x{4KMm+#2K3xkRFaQ>V=k$LPba1s?#*o8K5iq94 z^_%A(a#FUu5F-jjSF$K(t4pSdZ!BNS-r zo;RQEq2?y{wbNP1;D=hHYup$*q)Mmyl6l(26n$RT(tBzX<<=K}#P7fG$PJG4^k*O` zq`+FljOhVfrjH*5p}chG0`i&OHMU+ZEG8dt<49ttmwiBoTCBmxalgdG#KpvdmRc8l zz1I_xj2~w-021zfM1T;bQ1M5QBz*VC?u1E_3&RbE)V95S&F!_J%8l#p7#i%hJ^c84 zc#ANuCL!}$U)KF)91xlK=~hwxif=Zd2Q_Q&>tXGMOrIo-Vv}bWvhyKbex`}zIXkbg z%noI$>Q#dc;738B|9ZMY?|8h_XT}+hi3xi`(fuW(0k>b_<$3=v|7e%#1$sQ_fsf<7 zvzOz3lb=TN^mXSTN0v{;COSMy_PA?zc75vT1dgA$Tv%z|C1}kK%UMQ26i}z{TT`rU zjkpZwDrpfgdex=Cm7F0rQ~EBVztl*8@mBzAp2TMwa#VPBf0A!zQw~$_pLp-7ZLTu#KfQWjpLND`kMy- z1ij>^CcF^VylunEHv~WJvYf{@KPMS|%+Gw=S3cmBhol)*F(m-1M-m*pH*c%KW3}@K z0RFGEJ8+=`^y$%}Aky8I4gM9)e8Fh;l7a&e{9T^+59~u=!XPLJR9skRha5S$u?lKE zh^qTtN7s{KTVGw|>ba&uogR#n({$+thn?v&DJI|N{D#3A?wTBeOkcwl1#>}H1xW5KUN z{5_S7JeRpv0YM1e;AGY;7+wl<^O&LU%05wf0Pk@%RG;f%m4$2{q^8bbM?8OK#PzE^ zZ2NHe8_1dj{7x8MOR=QwhIbaviol+%q&g%<*U-k})f_;F&oI(+>#tyWOcfho>f! 
[GIT binary patch: zlib-compressed base85-encoded payload elided]
zlfSngoMR5|!X~(BdwS)@Z7y(O&jCyWAXR9P-o%6eaf0kzn(cf8vo|pbi9E8dv`5TO zFUPa=?AM#XP?(;cUUDIO=rZJd`){6kD*Pn?jJGv#_V3WB63)8%DU$IJG2)XKt~dqr zid7iTl-s>Xn+;R*ub@JuE(pp-z;6AAosIQ}`fA&rVhZ~eAppwgH|Ouz>$%2#xyChN zj51|Iz!5QtWrv&;0AIhs`bO0}O8?%FXrn@6VSD_W8AzBU2yi8x8|dfe z2TrMK2Fcbm5M?*Vl3vY>n+M7xvfvM9+mRl}Kdld1KA0Cy(jJo949_Uv7~Nd?YIAzV zr^TJ*#qX&ZDrco}s8yzI3E@2^8 zxfZM+JbKeLpSU4->>|I}SycsJYZum5goXRgRB3dVs93;$$@4C866*Q# zSQv@Am)OHpq%bUHIB3X8=_PYTBNx=6fs&y>Qvi{V-miV_Jmg1%!M= zRvdpB<$@t*mc~z4`>q@L@kH5K;z18qduON-zhFlY>4IS=qaf@^{~;s|d28YY36u49 zEK18uV$!?)-C>{YL+A5}$?3a|OO3%=cZ=Pe;#~DF#>PAy++ItL?Q#=_oAs{5wxs4fJeyjGc);yRG~6U4*~0$>IEX#Ie6Z(EZA84#LzPjs^<}la@8)*r!{1 z*db_k>k!Hc03yFc_&`4qP+R|Yf9|)y$}8_VoEx~U);^g5f2Se+`<(_x+y+qPv-v59 z@0FEmaJja0H06iwZL0!_`iQ}#OJgCc_ILQeGQjKX?WbpE@7p^%roW|}ix>U{8w@jO zA!Z`NcJT;UF|N)BjK`;mPSalR8^3DUdA|(%PVwHbD#O5EaIeOU>O8A~m|kIIOqm_H zX+s3rj4sM0>;rnBeJX{`sE_SUPLEGjBleTl8LDQfL#Zvf@ZF6DE09`Va7xrku-IOo zaAYe%nAQnBiX^|+zaJr>lG?v$(ksPMQnUE-$3vb&fr`rcqS*~X6oy5|q~ZA;V9igg zsr<7)j|IjsYWcy0ThZ)~^k^Q!XmF5|0ZKifOhzhLngLZFde^qNrW+*kFx1=xvX27$ zj{Ic8=uNfyGY6fzB(C(38sy1*b|@1{Y0pxm^X@k#VBT`B7U3Ezuv;thCQPEPY@lra z_;j<~Fh-1^l{#{EV}vIKZ%E$%)T?2_NC>2%U$#Rr(7#GK`#;)s*)abB$8=@S#!t|G zzYLY1cn&dPN01@6$UPrswW3J^z9zIbt$*&; zuZw>An}%ab%VqR4uQm1dFlD&^tq849iM*`3!RDLL7OzUz)4T&#`Xu1QA)E-DxrO0Q z9F6eVfe)!1X3G=yYUKel{Y1q2nBMdloE0OL_;l&&w^NV20ctqO*;@boc6EV2+TB(0 zhpD-F4}c95&AhINtJ9{_+pG2WkI1AKFrK<4rWB?2N$CV`y zd2d6k6VBH%o^(j))#HE>x`MRVDzRkSd#&n2dXWC+F9c#A3s`2xDOCF}pT2yTq4%L7 zg)gTl2Tx)g*;3e0O^(G_1fsF|CED|qrY551TTK7uhA@f6vfGlqGmR+hga>Q#_{#{h z)MAqwnytl+vNMqZq;Uzb@xyl0cJhwi`~-)0W>LwDLoy9NhY5S$BVz?y$?TB|c|sVU z)LD^`%!c%OmnbM^1mPr+BG^IJ*0eWmpvrsOvFg}-E_Grd(l2&SKb}IyI|TYMO#$O< z@Jo`UZN_oT!9km^*(gDK7pODoC`k z^?~%0VHJo_*NG#LmI)K9AFXpkp|+pea|tYVjY7U5MP+);D~}5|S-ZFcAw}uE05P41 z`+3v@KaW!$$gdh=14&6q6)mlN#dURc^()GBRtaZrDHC#Dg}AslxJe3z*r%+6*twnCS`R+mNT+;*l7Z5t-}5(yi41k zNmMN^Q*_npvpV-ke%v!v@Q`K+B|(gap|n$<4o3X*2_;UnL_KG$_?Fi+4bQ<7R3-{z zmKrC_%ibhwd!C;D+R%s8Z^tq8Z}!krk*)R6*1GE4l$A|Guk{I-oXzq*j@AFAdpiC!qe%K=YwC|y&Pa`OFcu`{H!Y3tWcPahgPYH)(TM5` z_3oB*i~Sp2gttq1P^lF9ZhH%PN**bfS2nsr<<)kOeA}M$3ib(nAaNS}1#%n)0a*)| zzpGU|CZLMljrzmh{hus_3?XgaxBAQ`s&VuJ0?;LHW4;0OLV&=(_SxEjdq&8@r&y|R zZCsD@{A2K%SWCzGdx|(P+%;N`AVg-Z1$Br=QH9rL{wY06`?|$@u7ZNnzp%!adVb?a za}o#&a=Er725v%#(WPnH19#n_pRemESLm>{exPwEPc?eiz!D*w-zB`7^_KEvoHHrX zxuxQyqNLw4t2}zthHsr!U->PSyL9zxHfDi)A4!iIvd`R#i1|}V(xcjJy9mo=a3or9 zS7LFf`#n9F@>3$P1AU6f1O^w9e$27=6EV=*r9^@%@qeCWMzTZlO9~dbcmuRPS)cvZb}P z9kRwUQPl}`&AxCr&%MhT#$XA_tY$-eidEvCsyj2n*NjtSfGd-aAhDeIaMMvKfY!Dd zdl7?=;Jr~TX;FAszbj?fHu;0F7I`e!fOwfO68El^=;BX4$S?=H?Ht61VIo_<0rh$Zw1s4gqSpDi+XOaUWC`yP`?#WxszV z-Wh>@!x!o7eMKWbWc@0OF?U(ZCm+1ZOxMC5Y0>1Cys@$x%X)-)FyG;zQgF)jKw=WV@bTIjN~m!J=)N(cAPdmk0YjdcO>!bp_x$@+}FurGg4Pr5J{e^ z9aD8Auv6IpuMUt=1f2lL{R)h&PxBu&UD<-?O~OWF)PH{7fX==CA3=a?9K zuLUcv5iuoYK-Srp!?ft(fH`BjI?e^nn9i^MpE=T1HC3~5 zmLo;x*&7+N1(4?33+7J#{4U0BLPmqVLh3NGwx)jg;6d~iLBIulSc}9buHGw)!?iip zXgDmX?#%}RuIJ%mU<tO{{2{Kxhxj{V)EGRZuwLBZtdkViT^Mz<#_nud*kue zr$sqAy3xBrPY;@}YiGNeTPnWl$fhJQkexhRT}#$aiei5|W-mdThEM!Fk{RNMg2j_x=Iwm_*)4yS%SEuPHtyB0 zT_vV)_>wYPaxwJ${8e#;Wsa7*r$hA>Jz@-8C{KR4G}+l$w#8akIqn%pd;i%HxoU9l zmzM``95vw36O0rxR29$ncA&=6L@4h&RVMY*3uDN%(5>G@x zU?d?cxM$}z;eqH_JsbGpdTWMhB-5O!OUP68PNR?SnC759bCPX6SiH2oLT>uKoNi>AP03JDADa`(!^ z!ymA!B2~>*32s+JWm*O$8g?#or9^F~(ROw`v)#;x{(9Kh5VI#ooAv5;k%($;dPPGY z;G7zkrVDfAD4mqoxkW|K*$lcS9{OiAXLxVyk(f^R$v&gwCC9zvU@di#UX6Hh^%g$j zVG!AQ33+%(et9+hwDzNm-T-wo9dkjpY47=DZ`UjI8aMQb6a|Y}8s)qJFZg-?#%JCH z2ixVr8D3MN$+NbXy)yO=tmb=jr&Gr1y*`!7nI=_vfPT$!G4N4NMk~K#tfrwOtOodm 
zu8~Dx!o&n@HHBE57#KAvgZKRAP>rFHt=R3dE4p1IX=T*2&FtsBP*@_cV-Z%Lo2uV> zhR1a=X4yf7%OZs+*f9=GT^S0a-K^D~0w@I5GSb zHj5Q-p3gfW8EIn#fI2H+2NBw`^IzXYK(kY#;0MW^RW)1ZfAx0+RUl>p>FL3F-TcnK zrM0`MYIoBR31xx1&Oau``r+#*+gB4TNI)mxV#q<&u-g@xL1WY4@VOJu%-3D;JCx$> zYrgv$AFc4cv+D0qrhW!9hupxednwBe-rYUnS*_|Km;0Y)b-O!d*UmN>2aJa{J^N~h zYXW(5Z_WQ@CkAnqi^qsEXWO@yh~$9`K4lf=iEU5i6sGBVpjlEVBGLX8?|c6;$`15x zsMxTju-cA%PF-Nd-J8wM=zDJpq1}xc@c6iAc?h=O6VNmRx?f$ar7i`D+zQ03;2*a2 zE*&Ir%N=bO@Sp!R?O3-)Wja^;B%lZvYWnDH`#@lpL&)MtO@k-n2Qd++ zl}ff5Ga)TLD6rbNf%3_^RnO|VMy&~*`I0r^rdnsC#sKsY)t9sEJD+;~ywgy~{tljO zADK1Dkd+x5ICRqLGPmCO#pIkX2~vv+3;Pz3Rd??>2;2s_P5!9S8*e||H&U&`(FE)- zX_u>Eg>UxYtJTVY1E)=LodXURFene|^|g}=_o-f-rorzVQ4P==zkdgA$T11c7_ z#l~TmjE@ML?ILrIVHlcca^PKU0+n@PVNvXem!u=Rm?mw00T&mYh)G#TYJQX|BXTC5 zZeejrpoDpd;K67>hT@8NYpaIBfa)4)P9T`<`4JcW2MD28^i3VH8@BJGWT$6?#qVH>Vetb>==+4bXZ+YA}5G$l4? zJdjfv4Z&v70!3WpgV}aMAJ5sle3A1dkbi3AQ+{$mXSSy!y|84cj>S;L!T0}z4wxLF2}lFU!1$B?-Xow^~Jl3DHb!ZPuReG1GFwpBS2aFU)hMK zS&ToS9(5t9*!(5dLSbd56B25ef!Ag%%QzcyKILj~B~N?=sOr_rpA_qcnRLYhUP|2}%V@fC-!=Lk!I= zLnL8y^JT$>&Gh2YIpZho|28vxcc6kLNy_@)W`>(BGJr@)*#*kJMg)8}k$uwxmIFXb z3a8Bi4QAK@oKpLrru*4@li%O`cg87XjAp~?bF z0<8Q61*rgwEPpV#(7@3+`20J6M{pSDGrONw=b_J#f)5miZCG|iR%&kp z3#u#4q?Cq)4`C|Wtr@U4xRT3AXDcWunA5yNS@@iiLNCPl8qQDajHYtDxp1(wvWi>f zL@zhKMbIu6lz70F_OwhQTd(zS|Ga*|Fd5O57!0D-l(A4gg3IIIcp+Gwl7GZ|Ug*Ow zhY~E(3Dem?9A#XVCp-gW`8dk>1l0IjS`ayx0N$#qafo+M?w2Ah7PKr$XUd7JsQIp-*0xB{3Bn}b3&KK>Fn3}?N2qcDUHMp9)|(%0kJ0|O#Vq=Z4FG`( znBW3!#{Z7Q3`mxGBwTq1#D~lfbpwN#j?d#N6$>`#Xrf2N?;MIR$hqUep=Xr?JNpAj zh5SKi-|*)Y&s|?->^nRLQ{JI;zI1xgK&Yd&?;nGII7vNTNZ>C^JrtLQ?RB+nqK%zh zd`%61czF0xd0Ae@%gria?p{7%)_t}&vjz;zRALPtzA9TnvuEh7(@g4Jubz-ob500G zLba^>bk-_Xmf-IGdM&|a`@Q$Y{ssXa>BIQEn6SrPYM*%cUf$ROU zo0a7rQZ{Vc%Gx+Ho2mW2j564F&DVoDw5b>Ho6G4BCR{!>+Sm3O{E*_fO5yiF6V|}$ zKqud}u9`0`2@VZG^u>tt_lG^vSq_d3|JDHtf*;it{FwaTNx8t(aTa89dS}D8emtX+ zmBc`3#;%*7C~_8OCkai5w6wH5Qy_x(lwpG1>eYiYD@Z=IBoH-{cnrbAjY(`cfccPq zrBP!m7_=ULQEL8VgA4^2g5H`yVu>|Mf~pgQzv)oOaY$7(RsLb6~!4V1@kp zhz!Z!F1&_|BHm*j$T{3meM%604QBJ-4+$@5Iu)mBu@J^4$MZ3XcI36Rf)SL9#8{$& z{s8dlUXTcU)M5}C{f~+Q=v3SPm~-8m0}#Xvz#(>UTMIp)Wv#U~UcZonW`JnyL)`mC z6c{I33yD$Q;i0{(oLn3b>=$yuowtbJlMh9MAm8n>uVB|k2xDQCrTMo2R}{xd055N!hzWViU_lu{c5fjN&u_y6g~Y4%X*<8_Tjmw z!zGXPEPw+1zphE-28Zn{gVzzXhx^vEs$ne=o6nHagJ4Nvg|A2&$SJm6+*>|_T+@QW zgp1%0&x}`@P`F4@oOXeK;0jXJP)Mwn7C*2YMM4c@LdqTCmjTb>)9(@>4RO8)oMn*p zH_h|l&?K~_0$yq*7XP=v-zVC`Sr<;HD}DI->`WAoF6}*k{(LA1xiMWAN9hM4(c0oo zem>JDL6<^>bYX3&DDqs61Z=s}MpLC)Y4G!yGQpt%3D9stXN4%(O+}cOXuRy8q)TS_ za?M~Mz&WM?;2#z6h`}PbfB4SGIp~|lr#_PxbNoXQj~m&^jG| zR&}{sdr~fB9Y_`4vBRTHlbz>>TfW)_2_z)4Ta5_7`sTAjD(@)pfrc>Qq-dRF5^x^d zqer}Zv9V|fxGw8MCn+3?H+SoK%hi@`CQ05mgCQXy<<}8&<$epk0-k^J6S7J|gOPiW zt1wslX>8-dRaNLXE%_fxi=7(y-aJ6Aq8n)I!PxeC92MUklI?k!aRX1dmsSZ>&g@QK?p4ktt!&8p=keD`&MJuOMC z=2E=>?i~aMHeLqUfz{k;j2>(of7xXXg_vaSmx1Q#J{MP?V z5rdWueBQQ-IvAQ0O@LwQM_3MJ6%8&nz0OKKcg#vB2o_FWvipW-7psBq3KP zAI1fd|D%?Kj~|(~wzlGulD@JjIR`z0NM00>iwDRjvvq?=yO7wikDMl}!HQF$zMUuw-iQ||q7ZVrbM!`J=Woj_nITz=9F&WaNmSc(*ICLA@vO|%bkgTWJb1B{re zR+QbH-F2G0YhS#0!30(${5#Oab|)M7GS}ADj$pWNGKyV>H{2ZeZaNz=%oaLlDQXKt6)Rq5zry^K~I* z&dpeLmuB6iym$Ge=pCErMS~h_A^-9M5Gw??A=%X>>a(k6cow^-+qB0z++Q%I{r`Qv zf;B@j_1%|{wQC)7acoGasBV?fMX?5Ydg6e0LD&9Vy$e&rf$w=ehC=SwuX5hrH*IF9 zeV`PbYN>%A@(6bt4GZhv#XsWgt7%nh^VtvAf08`^hh9(Bq7kq>*(toPrkszy6%>5< zryxb>bdT9q7dxU@EBikKwpB+e{i14}&LHWG57s#cdOf5A+9oXymrt$0?j3Ty9+bJePo+nS#BSiTxw-r`V{JLd5C%;k}7V z*=ZI837KMhC#R`IYDZp1s|M-IJ`xfN$M2PQY$iXbIXiJ=e%jr=3_Q@Bsb~N;+CAU6 z7p8H+9%2CIIR6*QH@qu<*+;p+wCDt}I$rHAciq#`E;9~c)v1&TYsn^}yo>%0bfsNO zZDI6`jMCdXJH-CW$8aV1a?NPofJ!S*F* 
zf{a~hCii}`D~{jXltIS{Osn6?NS?(moNC~tgN&o(l{n?G^`g!+$NEG3J_-9Y^rdpC zoA=8SHp9d$6Mu*&9?Y9|AY>v85!T3*59oh*Lei8(k!;t!eHyVA<;E*(DC1u##IkE$ zV6G>6lT%#v@*uhCX7yde^$9M$&%UX4vj>p4(;+@kdJ(lGw&BcZ(Kc#;KnEYFTj+nd ztyk&-1NEgW@%W3CkJII55ltg`d6;uAGF+D%t^VscePnEmuG@sb^w$+~xwA_stFGW{ z<1ndgqEuB?Z84ACAH3Ca+EYFvwqiG@)JNy%$;Ecf4@b=v6KKmg2}s*lCI#w#pGKV$ z#T3&Wm}fOKuQ=7FXwsoSUZMAyFx9SmB=IHm#rB6CB-|IT37bxINc}xeGRviGb(^1( zaD4}XX~}f-E(vdM?@-7DtdM4(Nc>svV6N&AoW!K$5L}&@^f*4UJMRfc)Qq2E zV-r=>pZ*iqP_dU5Fe9FCP0x`5_bi3xwGcv;`3E(hrZeIK<#@hHi`f}7Vc$m_y$`SL z@4F(%4kFOX@XTpfm&d~ExL59Kq4rvc4`~8L%OqAz-^|)e@BF;V97pied99;t_gUE> z(Egq>-=#CgghjAuKG>j<%tT}nE7l7+ z*YT^YTnw?_QUkFr-182Rp0AgC8%q3!prX> zj4KS0NjWc47jEzoUqm^KwEqCx>t63)(zly`vCX=H^ex-~67zUt^Z8R!p#lj#pz%w7 ze684fec&7YfdV6^u3Tf}Yp!fZ1D3~SZnD`5CdIh;vE_~eW(l*|n%VJ5#=Y+R{h|*u zGn=?@on)LUf!WggspAe@-tP7SW&%>k(1b@ac{?3KPG!FC{7f`{c*$%6=ZbVsB^jYv z>+%!UmFjVEM&p zsX9tW+9sIN&2-30bSMDf@x5`^{_*w`t8R_*vuDqUtiGJ6y0}ycyC1@EsD+u}LR0v& zOhI8ROhXv<9Ph?F*6^QQF(oqWsHAykY}AJ^X-@=FrMS+fPg_CqR9YE{L=&}=44_n1 zk6bSMJcGU?BKXU_OgjBu6#H}m5fWkq33M3UB?38ldWyirbpCF(`^lT#{RzX1RcDBD zMPASD)F=c-dl-KBZj=E90dGCA9s<;c>~4Tz-JNpKKo`&&5RDQ2d(Cb#vZ#LG26vC8Bq-Nr?B!-~8QC`y31Xs_6H9<2qY_-rJR_Uxd zBgFpOTgGJ z6y)Q1J0WO3S;g0`xHU2YM9BLMue`0Fo^6KNKx}W@9l{Dx?$iZ#vsT^R=mb42XlyA1 zz=qTdX&o@}Xbvt^qG|dPcAtRTk^CZ0E*>$vB~y8^8^Ywz)@Ga|VEa3>>zc1SxbChl zkh@_Y|Ks7P;UM4C^rCi{gQ4s08=E#NWou08;NV3g{7efj=G7N?b&ZYKF33-*CdlTo zw5!pgRhSONtB^F+-uv~gtULmsTd`I(Z2p24lHbi7m%vRq7E#fHE&6Azv7QmK9d%$A zm!E}C9%CMmiJTT72+dcbMo?aX!srF@{L2-!CoUsgHjF1+rcPUZNvR?J z@5`Y1myk@P(+(&um#%u~`$PV6Nd7Z{hJ(nCU>^+FEgobn!2yz~tWqOgwtSN2;?3b_ zEK&u@$GE&b)6;|G%a-HC>d(H*dN*Dv$UtqB%(KG`&94iCjJY6ZE<<2t@hicyynt^O z=Gf!0Ah>Y7Bl7k%h`bg9@8WU0=*$ch*4UsnGNKBFMJYot+#LCnQaPtiL+~snrg|O! z3~3|61hO};_1JuHV1O~(4SZWnrzP zRtr$Ih!VxMA`1DpU_SFwKHXVtRRwiV^6XN3q)wy?$?r@Ex2P!g)&1Z-{bSLY)6z(< zE9I-SewB-idqo|IVb!L+sl4xbmBL4Mdbw*oBJ?!9R*fi>!_y-ym$iIm2Lt~hCMYNw znFH#O*X!&Cj7|_(KP`k90@?nwHNt}^LVp*Y=&?*p%1wu>?b58kmLc-gB6>eL+hG62 zV_~IgVlv{#Wo^r5EB|!0c;~Fscg@?cl-g}qjKUgS z6!AP3i$+xU$3MoBUk5xnIXM?hl^S)zHqOYkW-QWr|&z3gsAyzXz$pyIW>dmoc))N z=1zY3a*P`z9VtJ+fT_1Yw~c=UQZ(q>T?S!Kd)dl3S0W8T=_x3GB0Ee&8Fx?*lv}Sr9RO~)AbGqM>%Xj76&93fFZ2K9%lv|{x@Jg%z-3)AF({M zCgQ;0mk(${$eeeEDkJHnZtAva^*06*xcDyz8Myd=&S0}^Xe@|8OsJUljBF=3+}egz ztt4l3k@CYDFVZ%GA!;fkXvv5PAidn1&bHO)#u*<+xn9&!0QT#J7LM{aD6I9C`W0(u zYH5B5HjFqqp%uCv*$}2!2G*h#%2IqlmHDRu#iCvjm5Gt6jTP6AAHGICD4B84a)pGD z&Dl2su&OUE`*bghR+3J~T>b0Zsd1F^O3)cU)ibK40<)XS`h-{_}g`DQbT6KDdU6 zA!GNl5R67S&l$R@rj7)rAX_74rSR)j|!od`*vc2j#I_NsT>v860-96AeKVrmwRI{+RCfs;# zJ~z*rJtKwk5p~|(6+yXq4+wl-IFoulr|;gzjxbwUABSW<5`;2^aWhq>Dwo=9UiRoK z1E2Lo!R(x$j_y}~WaC7Ao;6BTou}q`a(&$3v=0uT&EaU)V$-i-oBTeH)aNFh)T+HP z&Nl&m^)2Hyy-%QzvK2)D$$hq`U32f9L}u^C>#q~hWUGgA#?8)2yIv@_E9gMqmG z-)Pc;cSosG!DUzC#jCRo*^9H6qnf4)7-11w*&;jWuz+iaS&xkK;@awCv&l}>)xNR_ z#~n}VlIT02A!zmb%pt&qfkg_q2Rw%Fd+5fP7MYNNa8#!rHh+UpnaKeBJ|IM%{``gg z7M)f^{(>IjY3z;_#k7a>NRV-XBEIa81D}3RtcYRy7mm^%y51ZpRlPjH^ACJ16Gju( za{VCi4Dp4I)z9t;X{t7e%(6qf zVuo;7+OIcYATiRLak+ws?2QA%HwQ2s2|*JY*Z9Q5vN$Y2P^|wGmvmeZ<& zG1Lh=ESIt)Wo$lB*VQd*+rUr4z{K2XW>Z!IzM&$`I>-0Nb~yr^+HWXfo|QqaQ*v#G z*nBbGoGb&$)=lckr9}hVx?jKTygYX)3uQx*OTJ!wFGyRtKmB!}?U^2P zM@(1wn1L3W$pZcJR-|Q(nBFC?#alz3I}ddXzbk`W0XoXlthwpv z5;OYBHdkb$tGc&QwW9|mx6kHqn`bOLgwyQ&Hm(#ZcIwj%ga$u{v0WRAy0B~M_x7Ep zR8gKwQPnJzFpo}6#AyT;t(6)iAtu{NPyyDceACsc8!5}TdPFs=X@zdBGvUyd*9C0K zcXt6+t(!@_>p}5WgKw9F^gkP~$FTc$%1>4`J5jMkgtJj72<=hKF?hY!MOfwID|+qL zII}hUbP1Pl!DV}FOmM;lQOmz~Cc>31NbtSSA?-FIg}uBuL^UCzCD^X8(6%{-; z*?aBcYSXI|f9;YRan!MRkH9PqFFhY?Y5v^UB|-eqj65dCKX;boT;?Z6b$GQk~Scip-1h0{Fit5+(M2 
z2(rkjtb877D3j^1i7kwcr{-=-YK0lr;xzF>4{cz4d|cl5PQ*pHr^%CIU+V~T=uSEu zPsJRDo#h&rZlPR#yoRrg3Mj=Am?sy5wcPWxgaq4vb#}|A=*PMJXbhr7h&Stsr5z~b zW~hdX6$b%!m@pX8Le~Neqz^M&Do`>5C4$V33I(=R1Vvr2(rwzU`@UL74oZ}xjI(x7 zB38asWze=@>&2d}9$j*snB|_jSr-fs{uUEIn_vHct!fF+opNIL&+y4HE%iIb7}*3D zcJ|31_RozQI?7Tv%?EgCPHR1BUpLLVAlc4E5fD?+I;|@uOZIMTIw3J`8B(nn)*tY=cwuDj$MckxoOzDT!PnISJKtW~}NX={^s;Qb-;Qmr$# zAYZze+e^hI!5Y7F^zM4S{5*WXxJsxfI5^n?sL^WIhi6Xf3jDeXBsezGBtgf=jsiYc zm6SrR3L6d=_1F}=Oc3Ho{pJ8HZ<9wiFOu_9Qz->NTtOr$=W~#+@)(SAMzapR*}sPo z3DW}1%rj*5?w0{DyEPPBB586rK1D=ZkHN%Un3*wbeAnWx$s|dkm)g0ui4;s|S3gas z&G@=R{$u0`I>y$IQoTTe3E}5_n_<|V$sA|_T#CA3GaY_-_Ut%<-KDYfmf7=^xcge{?g2!Ni5jx9 z#4sXd<7}Z-TU;f1{;QIcy$~JHJ%fskP>*|QgD(sddXH!ke?S-?%=ST~O8tcb+4~ZZ zkqQRb_G)l#d-bC;YFW?CEZ?L0*Myt;?j8IHP3;VySuxFxp~>VLSqak=+KJrGdx_-p zQP27O2j%b6^i(kq(kOXAerq8|u)A5^dge8Z@u_}Q;8Xn{fqrRgu91zAkeNxJe8?=S ztwO2L1g7;uiD2;UcDYOj4S9tE-$}dk;V&co#z{!WrWIPK3QC@Rgo0aVr z_Jx7)HhN%uQx+=o!j!uxIjkG=^+M95@8QO88uBms>^gQI_|iT~&EcM9ouH*uZ7S*5 zvSB~P{wPZAH-zr;Vsg|f>{+0z0#l4?|C6MZyP5{8_uO2~(BG&hmg6+mWc9w4)#uBA zqxr}EE7Nbz!e;pfUae9ixcq6X;taFV%(F(oCv(2a?G>a2IjG9O(0DYL7lbNO5sN z^6|*EiZ(Q?>v}wn+bURRciW*WXdsJlP+WKb^Z5NzWa77|!B@=To(hsLf}hYCX3V5M z#Q$zt#HC}X)KxH)?RJncy!(0Ln8eZ{eYPh9Si_3)4s*#=)wqjilZ6j<1lst?EgF(^Am@wX z_`s&K+HO%_bXE=Z@h%_AHnU&n^cc8GT6Tcee>xx4D^xbG_86jw_Hl6*@lQ_nNf|VA zciLngfPCQ10&r*pX1l*nmOnWyz=uIjUjB%KOU9gCCcY{q$+DBj)qnU$REL6>+|=S_ z)TNnjRYRqUQPK=)$)~*o<}E_!O`7@onLz>+r4p|XgqzQ8bcS9PXO)Pv8Rb_uW>89Y z4!Ucq`ZG4K!_l+sS8D}Y)sh(Zjbq8eE4Lzs8W-tIMCIwC@MqTgpRGDC zqEs0sh&y;F@c0UtzQvtbl=R#UD#3Q=+FWD74o?;xFMn%jg$^J^eP z&pAMTew9f1O*`zV@1&Bd9)WY35N+r@*_`O3X**x@*ubD$z505Cmkot3Df6-RX6v43 z6R+X>8)m|u4ey4DaoUnj+IgHkf!wko4|$8s`N*G_zNAvw;?jPoX_H#G{J{)7OYa~g z1NzCTVS}=-dkkxi~5}{)M3p;*3m9O*bV|+ZNLJ9i9WJUJhR% zEiLWwtkdAu@7UcbD@4Vps3^F#7TkOWSj@|T%(utioA2rb<&F@F?V*aI4VKFm*P($B zV!vv}Iz@-&eiQ@Uk{!V_Pwx%a3kor}G&{SUvV&mlyR(Db6^r|u-;%Yk;p5!6+dQ$v zq#V-_b)C;Ci^fB5a)~`220tIls;bf+%^eDQavRL<&dESYKuVNj$w+6qP>H9o6p@J{ zj=|PO==vlRm!fW|uGK%{A~vfkmyCJ&C7BUvAFV(1cc3ST|9+!`rp*+Y%jm32ir#EU zHISzd4H!zpy*%=Ivd$?o>ub*eVSqiqBhG!c<#)$Nc02BCeqW~ADxb1Ck#S$eO}qEB z=G9}hFL;E8g!}Jww6`!P7Ko_rx4u9ychHX)5KvJkB4GX`4fq=`s5XmQ>z=#IwG#AA zI(78`oPS-}q?MVfZmy7` z9;R*EXG_tO^@N|$-{-aA$3*i9t{ry-|1fWkV%jXDF6qqm0P@igblEn$w=MX)mg9o> z!7G|HL)p>zEGBv;zS@cpgnxyNTtuG=IEb<0Za;{05!iAp;{RAVTbljg&Cj|A!F7a0 zL`5@?Augvkmi#7Eall)m)4l()PSM>o;YBCThtrg1_SXxST;sU~W#W@2^F>cz<3TkI zn?80XtxM?{#Bk*#o58e7kULY+v%v}(CmT=!ZGc$WLZ z>DqR;LNONWQJ;-7itKK7eT7F)kvr*-rE#=0-MzPtBG*2dH$o)@?5TI0Y|qpMejKRQ zx-Lb|HeT{jry_qlnFsbEW-~K0!4i=j?d@-_GJesdTU%RM(e3Cay#>ZrifMxM$d}#^ zaUdADe9~DCzfeG z8&gjT^0_sR8Mx-Rb*jMpqa$ED@&*$`W_LMOJL~mO7q13g)Vt_Qi1;WoR^tm7N*by{ znNkl3@pT`gy`3Y$F~%1*?Jwc$f7$PuZUL#;)epfz@A3yo@mqQOW%|)8?l+etr(h`n6qOmN{2 z72a6QK}{v6kuC~t(;U4l$SGBT;<$EwZZ!NH9Ng;D9OYIPbX%eq(bP-R-ibeGa#K@K zrhTw5m{#{Cx)}_nh<-Y9N<8A3@8%D2VwfZ((JabjYDPNRqO_FiwCTv}jHF>+!CzTf zrPC`~>B5!5eY&0Gh7ikG?EUM3J(t@F;$E|8qP=Ec9(Ih>Iu2piYHL zip%LvO}Lu$(cQaoj!+}nzc7`^_gid!eGjUJgpeQjFFtVo;Y`W<(9RVvZwIZPgy9U?u`UViz|!;R8Mu2`hUH5zh}VQAEY1 z9&@86I*Tw_AkfG{B%+|AbX~0&6pcSx>FgCcXm~6u`yi`t0bTYe&6JBK)3Ff@PK4iK zyh01%Q^Ur__EagK%dMeI-tasV zj(R3B`q4`8IF{QfkSXpK&n_RetZM4BSl%Ds%k>1b)y=Nltu~g5m*2N3I`&`p-cgg>XU}P^?kMKbL%1$uYC2;Lg zsbn7WPjr4RJS83nT)~V7datA7Y@WSh%WB6c>|QFxk5~2cjY8Zf?HZ^T;X;gaLEvxv zz6{63XsDZYh2bVu*@m;}*Pys$qHl~AFg&~)+4qc1+xuSFn;;nJasm>+JH{nQ82u^E za>CMQE18)*bfX^31f1Xk!UPklYBuKA_KFAob_9K|&hy1ia_`;sXw<2NX;}VT2z!a& z-2^p+Yd?t;m@c$<^@reR!cwd0EEU`aYQ@G=OdYf!L?$*TR8m_GZ3#$?!^w^Oxz^&j z_Y3zm-e)k}wfa+Q7Z}qO)9Y}Pzx=8An@qO|ny~XEe`+iAacXP1y;P0kpb7e#*@GnU 
z}osNi@S3ek1{a*4s7NQfAn#Bjiubt@pBZj+YocX!Y#yGy%M>8vOW9J zO~{~Ur>U2bjZHHMH6BOS*6(#93Ds-3w<8Jd1|BNv!w31eTNGPf5w*=^UB6zHMTkWZ zC${hi2dL6p;t+kF<9TmI#+S?a5V>x>LB}g9)$QNGX;9eO4O`R;#5+5Zm%sqym z4^Q9!NHi`v8Pu8WSOBs8*fLYU+l*80w4|V0=T;^qD{FH-R$c9RQoK1+uLoMnb6*aF zk#o1pb`!uNLzB%ODRg;Dk(9VN`V61*d?;Mvvcx|mP+8~NQ#PHE+ ztBa7np{;FU00p>mrykXuPD()F&*iGP1io~lBEB1sQdl#gzhAaQ52mfeXmlA4dtSmg zRV|7IdH&TnWbty%Hzq!Nz*3)p%t}kAEr6X3lS#Py7s^NczP03*6r#!AVC{#2tPuTs zUVqMy?LwQ?-*c~?zIQniq2gx!k40hNZ|1Y_ZjCD*(;)%*2>4z@5RAo04-d9Z5E6#sE~! zJ8#ywv&nhqisW$o_=FGdy?k6}EJ=sWjgt6B^x3l>iXrnB)}E4P12j)00q0y#NwF*h zQ@BFhG0FUg+xB)1?dGFi1fGQNr8aW3h6AV5j%QI57j{X=WK8clDu40*VXBgJ4yn!=_$zJ{N z>4T9Bh^X!S&WncX*N)%Oma8B7Ke8X~5{Y%Fi7xG7S0+IFvH$IYsN3&sMN*Z5#pGHy zH_4SF_s2sYi~2dG?G&P%KX|AoZMB7SR9*2e4Io`Vo&1DDgA0R~7XAC5uDQDSJz&|W zx_2PtV>=kEfc(uv;K zsgyHr-vK4vMl19_)seHISLM~^-R+FUTY{Wvg6};)og6d2t^kceVM-4k(*|{5MQIml zs|*C|kok{!O6NFTowEytJGMve^baF#!R6H!qF^Azl5>mDDx3GD2E3G~Q*35#^Yfbb z_@K=S$-5vcJ~uz6ti)fb?>XS)h#QtSp=aS%-|D-w;(z9TviKu`@g=p7vgdfkLLdq$W@8{vWLr5HhsI*n%zCjZEl1LAQtdtq zMI~E=a6oC+gICz!Ebhn)*#+=^QSGdE>THM)t#)G-X2Sf`o4{n!{^@{`L6%bXCubzB z_^7dn;10jY{?d=VDwzcxupmJC-G|jNIeDs-4-2F!W{$59pI)|y5FHq3Pd?ICyZa`d zaai7s%Uy{k9re3#fe@UmDA$dlJwo?Q4oA==!BL_x4++N(S01MqNAJR@?)n%)S7Jt(cOU*5T^q{HK_X!z#RdhX!>K~f zbmHQMptWJ~^6a48W|V_mz&_P>@|*H2dSMBIFk$!oY)vh#%oF!Yanr|zyp)T)O{^TM z0*u@5qKMOS1jDFM@>#Wuf6ml)1z`3 zQIK7u#qrG<*G0Vabg0lj%9tgOMxKLZHpSBu2ac%Y^pB@Y_|m$^HuMx-)B5AKc8*8R z(m0FF`iDyJS9E6!mp)KBmY>)aS+T$BUl8yN9pN0Q)n~mYnHYW^e4{7h^=EmG{|ZKeJxB+bQe_kxXhr`BJM;#bY%A7EKx# z_2_=D3Iq22|WEjlV~v)?>sEO^U?9+hmX-<|48{>rQLpeX z>eB9TejUeW-_5DLP=k!EB{uZx_i?NSPaDS-Uq__9u&L?W85vBQta2$^9$SCWt#`1= z?%3ORbQ<0`=Dc}QtmsV6mV+KSi#38&OY`2!+?+nsbGiFH)*G*Q?&+2Bow~fof3VMO z-Q06*&$c~6B0j{t^hHE(WW=R3U5v5_ZRiAOC!40%u9fuT?%V4{olo>g*v>EGG3IlS3NQ9_E`eL}oW@C~7mkWc#5RQT_z(nXVbzR&z} zEX6`Dq^sh0H{ST}bcYFPEC%)#(3`y}4{N@{jZ!XyeUzg77SY>}Mk&b}SA6s~2b!_<8Z7SFH=f8^1d^y2G>+V(JEaADmuzDHQWF8B?{)`X&~ zdh<(pWVhMP(QK+=LBB!9`fq{y^DiZnE<=@CH_ODsz8^p3X<$%Ln55+YxYHvl@ArP$ zJsyI0IAg>V#Z60thnbqBUgtlQe+qis4$d0F!p8PmXEwB7_O36HiEs>kuspu+`DT#% zTvJ%QlKPzPz{J~`2&|P3n)W@PLi;sy_L67+et(DHnH{)Gj}7G*Cb+WVPo(rWf>>|l)3=2bVT;I$o>y?!k{qmnTbY}|-tCKiGv6#=uMpwh1k z!VT@>{vL%v3*%zDZB$e6i7{B?7y zwdNhY7uEMzGmNZXkC8H6_=lIn)nFBm3Z&fDSA{lbFYs8ptp}KO-#U>l1<|8|!AVJM z`oKOSRk4GiDA&EXAe0iEns`%+E^WiuC#%P8ANLLtih|Wt+DTZl2I*UIR7oGm3p z$HtoQGiVpFoC50-nfNO>Rq*0!{_ zmwEg4o9+o%EYWLowZ!SmwIjO%1e~%rYwBLtXDe`W#T%^<>+{1kndK(7idi@ejE<8!x4=zbU+F)62V_z2Q8h&!s?NaqILOz2ozGtZvk4 z!rr&=RVY4R7u2wyuNz-n_uE;E1*+DPkV#{y_{pCQORY?C0;Iy;l98(*dS2a6*csc+0V5|LA|L@OpWcasYz9=ycGewt2;A;60 zAiwq>qUHJ+vRk}eUPQk%`+f&bD3{#m#vjkF_kPJK)_F^@o77ifXAG-5tJlj=i>g&>J3IF^>JA{1VUn#^mtA$d>%}Z@MgYm#V*{5G*2hx7N_Jt zFRRQ}C^7xC8XcN(WNMq?RdT-MiFo3I9b?hk%9G*~6t*(@NG2j=Z<9w=G%OXWiAtE0 ztvt}p+Edm$b;iSGwU0K$>BDmN`6AyVphBkHFZH^^ohWy}cDh@8F)5Ukn=VBuLnvvl z)!<2{*2{+Uefr<0-r5#b`gN}{ZuTUuBdo?Xq=?-JD$G?|5X?k{WXR5QkxcO!qIz9Q zp4zKHyYaS)(K6qRc5AwWUx{ysiwy%-a=85r7dLZ5PZ1&5@MOol;$i|>{ET+08n?nr zbCL-?;T8tiQD7~jVk2(S zJ<7}eEGE_>-aAaNGc!7=YiAivzd?0p7A`9-#<=LOXofN}G7u8JhDxvL+3RBgTI&4G zY$&kq9!{ZQ{lrgMGc&W#XzAu}&R4S{k(Jea1+kyA?c&NQYOhI+ywp@U5LwyTXtUlY zX`R?d*B3h!{%ad0jWWv0;nA6zZ(o#~Vu3M(K@!({k#Y${dlR4gM^AtXVE<6eW7xEC zP~A)8vl3T7j@!^Db1H4WiBm$B8vz0&LkNU^Qtpc2TW+uj^)dB1@~W!-*Y39_M$Ez% z!x=-B?O=)G0+Rkwwi`^mo``M9fp>+j^X`BFF4!`j)j9Nc3(7du=e|&>Y^dyQ+c3>4{R1H zUPOr~EUG*(yJ!BLkmqzFw#oKmK21%Y2D7`JK3j7qR*j9XA~%LMb5qPO%KmIn=>N16 zBFc{LFvu_$sI8Br)@>|ymGcu0+==uFr@f1^E>r_uldQY=?#q{R^rj~%;j7V!tl`3+ zly4G=95>bdS7oE6qgIVFyXTeD_7wW!N2YDvEfDv%cTdoSxSBn zQa+M9mzP@E5X^Wu#D2)7fmy{DH~F5A%Uqb&YMm?j&i19(y`P@MPwZhjI9L>5_I(^r 
zviK1~{}e(U!uB@9<})z~I!o6VHLvp+{Z~$uqjYJTp(TY)oW2(r^RbI8AA^l>FLbY~ zM;)`8W-nq`z@h=#vFl*5pR9?I+OrdWN#ijnEH;Bk2+5`76OXgir|~RsS9|S0g%3?B zXM7(TZC|=?<6ta zFwzZlQIB|ibZ|y?5oYQa3(s8Y2D7yTR#mz;d6xx2d`;iD4*> zy0KAMCC81kOR=ETW(8%6xg;JJT!e7E_A1mfvgC%X9nW0ro^)3a*d_b;PQDO+ebuF$ zD&b6$yQIKXfBt3?v*aU$Ppw?Ls}rvKi*uoUlll&+*q*LOy?%g|(R{0bg=xn2H`q%I zrbaJ;Qz7|B>sUYezcdz6)-SojofpZ9Z_EU06%+_qwVyF|U2A8WD!AJ>OPv*^m>+i> zO8M_FOe4Fq$Hl)&)h8{!6w_Kn5!+jvl&@w=HXP4wVB}U*RjsNqiy&vEx4>v4Y23Pp zj)xI$V3mqA>Pp|x8Q?MrfxmmlVtJ+}RKTSdDj~8QTZRuejQy_5wW`eY}7V2m1qMa$m6Gg2uA8wplP(KezCQ2nN z$dBIVXS~@%xlN zlrP^tq0IWY5Z>&qSyUUSGeb3l^K~?;GwBE|X~Z9TfW zF>Rul1*6L*m#vQC^*X)tgVO`=LK7V3>P$DpFCs(9Bwjt7IooyV=}cn2xZGcgo7Hc` zf-@ZFLlK(^`VFs9zu#Or(~8)6Ron@cN=*@Vk1-8LIDZwNnAv!%Y?5wAspZAY}>#D6I;_t?f2D$5|AKAN{k#7`ifpuDsbzwwamALD$9JH1100G7cnY| z{Z_i)F~-S%Ej>?9=CvXODSD~zjSnqLJ|sss3uu)VDoEADCnzi|B*esI>t=sl=pF;-Y@?q$Po(Ax!)Gm1w;$!Ym-6B17V`a+AO6qG|L=$0p&~ zDR{4PXiUA3%H=)uykO3qRExQZpVkOVT%HL#sJ5gR*iy;* zDaa|s6Uo10E#KKXnmB10aY)`=3aXm4J9gf@_|t7UnZ-eBt?M~`?7eGKEv6k`cE5rf zCvFUiF!y5#$s}|`fey>$ho}0}aV=cY!tjn`a24=AWDuZO_uqy+j64X60yIM$m&)os zISF$61H@(dP?`W~UwsFDO{iGd>&It*$U?*k!WGdRVB9b~BIH-x=9o)#d~~J_Q~$>5Rl`sW zisyc{4A9{~2+BH%QW^vO?jl|7_S7k39{<WD#6_0787Nl?xemXnEl?) zcKG(BnBIOh#DA=t)3Z2Vz0C;NigAcO-YhhZLO5Gm2B=*y0?8ms%fh2{R3Lk zl*pL^wCOBc;YoT{E$kRNnXSp4=sPmWPl|H#Q7WIuReK#HOa&z(E|SgUz9_McpYpI> zk9X_rKO-<-TEgo%mGbq(f^P^EHJ98>rX&jZ-v4Zg!DNID2Xv>fBi}cbhtbDjHV@YH zrS0^$!wjrlFy!|I(|smU_^l7)!ei79zDwrrCft-!CoA`w`gP%{ zV%7QdLilXBM+%K1jJz*D-+_;7VK9v^pR==k;tKH~KwdA*O00l9))}odTagreVWt@rJCaSYX}_Cwc@T%&_ssz&d<BO96Q`# zkRZO|rkm>~bD0F$d^8ao`_78lqOAT*GKY@cAvY;gQ>&}1>lwIR*Lj!EqVHiIBqv}n zH^awzB;P6Sqk64R+SlPU@wZ?dcirKj;;$9UAZCN$_@1Hguf-Zh6^eJC@V(PiR1DQs z8U3^|Vg_1ljk&d~6wF234&)4uv!TOGvkPutpu@B-Kkw+uz2#SaDc8Ofi36vQ_M4bv zhlNK)UJ5#Byu*#8htcB_A*ogZWhTmn?H9#1B62_S(Q9Myz|{OcmZuLJmP1pBk*Nlf zC|#Yc)@AK@=y@=4t!qZGf1Mlr73^%HJ|S%QCrdqTNi{zw?gxa(RB|K}_{1rK>{}EN zEbvi!;aR#*ZXEZiPv2bR3VNUO+RxU7Bf^xzGI-HT>W;)`PM3To)tzGEwniveCSWMPS4estsAZ zU^I|*f5E+o?++sROoESil%J*$+=rtI*W#EhxN1ATUL zHvFCEHQipZPprXH&Nnl4?mXBy*wH+kEBTI&*zhBoOnI!j@B)wtpdLsOC$iD^etnaW z@&3Np@d(j89}6r^FqqY8UR%}v3*=}Uvjo2zvXu4IS|%-qoEZ5PE&UGM*ZcA8=|5%x zTrc<dw0&W zkh@b#(*32xzS-#8jsShH`3Fecchi|tT)4NmNhSqen;i}8{J+To@>LpW6t5CsN!=z8PRU~O(x1zmq;c z`7VY)HRUvkk)penrAqxS8h%dbQYc*{p+etyTV&pLGsE1UeEev>MStbNgNo2|h5I>W zW!jBYe}a(wl*Je0yLtlLoa0YM7KG%Z5IPg5FgOCHLB|A4Frl^5Ks32W9(OJiA(d+@ z;Pl&acqk!ou%1sn*_S!=dY^HaMuwumzcy`@mk80jcba5{(vP*84lQb+D9}JNBKU=kQko2_SQCBys%QpKN@>Pn6^km3Y zF3IvZS4YSW3DsQt-Ax1AYHF${MsDsAZhMLye}*i{Obb7@zder+Fn$Ubm%Yy*gZq{L z(h+5X;ZZOVT*pEYo|tgI{rSvhR`6iL%YwmB%}2z2((hqX=y{A+t;sd{#l?Mywu{Y* zex>+auq~Esw8W^R*x>sU`zb+F3yTydclVfct%Q=IqBbydLV5hVateP^hi1bdx0{IT zHt%>E?|02tuRe9&%h7hXTobDwU9oQeVm0(RspBi|tcfZy@)=DQWXWGP0@%>KG};^+ zPG$RM>ZfE6WT}bKI=Sc(1fA*R?kC7nU)%XxMO~@*A2<*x6c&gqngtUqFrcIf692=Q z7Q91wGfQfJC?fW*o}3?ACKxVo2VT+$beB%Zk`c__nj-2*lXdJop6t(j%pwg{(c+$p=Y#3GwljP@PO3xkd9SWyulozp1oJvv733g?yR~a}5)^?+K`F3*51k2LTy4k}9fb*2<9Y|3$(3WI zDg~;42)UBp==b`CBcg(r4Oe0S~$YFIl|; ztlicQMmd=b&Dh@-PX<(1kySM^==Lrfp za2&S>4*aU5QhIJQnXix>`aMl+Vb|@4w^A$c$Aznt*^cS&uHnD3bM_9unRXIttgG+{ zt|_B4*Q+mAjvDerT3LyZAg?c1Bq&f6R-XHLY!p@f8V5HPcw2^>-eK-_v6jq%6;dr<9X|lX$H1UonV-k8p};sh_pssH`h^Yx=y@+e zNC*g#vQCF;GVMRW1<9bmGw_==|2;^6Cz~dfglgs@xGT`|&`6Mr0ZQyP%i1UksQ0^; z?W;b#dS^B>p_lx`as7=_QhHGnZVLnN^uR@4yA!VpGC)UGucGrB5fgtmTu;N#7$Z+E z(hIyUG1xynuLokrQ||Tz712w(4@^3B;Rgkix3J+V+_eTK7)*?$-`;2Z_;4S|`(2{| z?XH4?!Y(kJ37F7vI^OSNeivO8dG($`fW0Z7fv3f}^vQ^{eaH=??Vm#U+!0m zrgy?DQ52T4IpnE}bZ-|lS-sD$s}s%6%}se3VD4M{`cHtnzT)^A33pObk`Cx{>72`k 
zYJAB~8Og7yAqU+l&tzqT2{IX!lizP{+E%0+5~vjG*BF<}_hv{?S6KA(_bVzWcw8C+ z2#O&G&M4{>hD6~3_V5B{#F3p}{Yd&nZ=#M~?FD_kf%mp92ZoDLT|WAh60q}sDQUa!?MMpwxRk;1>m2R<$?YSJmTm%c}f|G`?{jte(jV9?(J|GC`XL(OVgDK;wU z$CCLS&%0UQ3ow&RhHhony4^vA*4DODwjeX{XW||O=^M6Dv(w;SE86$BJI@U4D9Hk0m+82I0Vb@8n?mdF&f@Pp)M%PVf;S z#EZu>c`st?!xYq6q}K3jRKT)JfsJ2BYlzg;FkX4-qSyu%&+pk3@keH+ripL8@eKYI zY~S(&qVq3c3(nW6lyb#!%#j}8^W)4QU?~;ex0wQAYSAPx!dT7cJGMnA=fdvw!m06& z>*`wk)sJc&&@k|qPS*|=p`&J88YK>0bx_a?!1eXR(nMU7Z9moaC-IpOjx?5IZy2(* zuCE+$9VaPeWv<;HrzS$rgI_73bcNnUXJSnH$C?+Zw`|}%e;8ylsj|+A`SCKnAN-YTG~zzsBegdER@?ecIrNiJ7>49 zEQYw}1~JXxF)dRo>i z&&VVkQ*B4jdurlW-TVKzF@gZbtT$QFFypg_DeSo9g>`(rhNBU^z9f**(M<2m<+3hC zCr1cXq&w``c?|f_K=a5h5!nD#^VCr+P&*q!LP&EP2?&`0m1>enzJ0j^j7X*>U?uxp*zYDX`&^Qiq&khr=(?|34{hm^Dh*1Tn=}%e9y2>AqC> z+My*z-iQ_t5p~7~;$BD`138i^XEtA zO_Q>l*4J%g2SeCtx^o`$u~T9>JuOjy%iN{-V>0L1AV-X(3La5yETr-ThjPI-?OrZ~ zq^JFy07SZ+zK7>gy$F$GzKt%AooD@@ja zFApUBPtab4kT%7tgTRQ9(AQG0g8vbY7A7z&OmaS3(1olvowPV9<;g?Ej<<&oK>Q4Qb8afzjg*MCr0qr z>)s+mp$OJT*zjVVFzQdi34aeJBxnFGXCB{TGXTbwTd-C<)dPXvSmi$-`z&?8a{eYK zSW$emiw1dH@+KyA5x%CrI*Ni~eI+lS7%_5C%1?nPU)pHC=^cCn*UOtE&oo?81|cpA zWz_StwzHS2mt%PDYTv!ZaDS)}bG4>Z>l3Z^doM%^jVH zkot?gf6ssp$3~Pfx{CvL=>dLir%t0Ic*$$GCxze)Xc*hxfL4V9M`I)hL5jgP1ozh2 zmjCH&i)2l$RRDXTpeNvCAYj<>t2W;PAmFP!gP;fzK)SX8=dW?b@&!2-yayc40g%qn z@qCR0-zy)?P6vYyNWSF7fwLe*$yNob{4uYf26nVI7ngZrNJX2e%-ru3XTn*S$S;sH()yovn)- z1?@7l8Q;JO)su$+nQWP1HL(Lm&#o#fPW71@J~r+Mu}xM{nMNpgbX6+uH7l|b5alKf zA6AXr26oF03-HWh;En$mt6{u*_TcOlmMA0btx-QqcZ6Ry;r=H|tvH811Q?eRo%#U) zhUi={Dl-_Qc|E_x0ViXm{F|2KKfnUZPj#-!#o1!-U0t_~C92$FppC5Oq#8plFFnez z`nE{it-?i~vRLLRCKiMKW$1$VY8{PifbYS$QTU*2edplY+tV436*%cUF3HM5wdhN40;L4U{Y80AElnB|+b{alN&m|BvZd;i z2hL9xOCE07l>X((7a_Nve1M)1e1386c;mx)cUvT2%XjQuUBH>!Q4wBZQd(ePp}f=d zlwW^wdT5|NPUQaNb3r{X^57Hd4pPs?*n+K{$NU`kC%@V8RJFN>a=C8nb5=)A`P^K4 zPchZ_&(9lgPSp@~L{fK%2|bPbziGn|(RGa!NKx~i0($wMSVNYY-=N?-^HeMQyx4`6 zjl3cjS#RZ>JMvcy?=+rx7qF#WcHvJbYMpVDAn9adtwNiwzPl>(XN*+~YP8$_86?rE z^D63G&vFV)G94-jovgYz+0HD=A&IMUm_N^L#_Ep8mU(!K`Bvw74 za4av$yr6lIcG3?FS6>3ckK=9l$(pQmI~$!D^q(Ihb=;gZCQ4~EP=9sbu*#lt#&buk zP^S!9gX0{Xn7FEsr6)JX3=;n%;z!~mEFhC>mHsP}+XZo7g!(qT9XhFr)eWPP0PGFR z%+&RiKbME5OsHC@7&cQ0jfICswp`YY>%=tP>&A8>AmS_!jU0qc-b%*|1YQoeT*P0W z7hHB`iIO*62H!sqQUPZDfc-(1%?WpiYgYk1YOCA$4Cjy9qrjgJ6C%Vb>Q6L~Y9Y5` za>^&~_i}gO=Ch_RArI)NqdW^1RW3EK>O>@1Z-pQkZvO!$SpFq3W5>%lv#Ld!Oy9YY zqwZ?wjat*C28VI^77-<%az2eAzn^N$g8F@A)*-cISXz0PriyCb-u0{XzN#uL?X-`% zunI;TS|<_=W!n?W+ww*xJIoR~*+iBY5tqZ2mHb(szF}0xxh}q?5DHB&QM*>a^A8`})KbkN z?UA=2M>SBtF3u`Km#2})$<9e3l4kRFC7Yeo%s37tX0F=w-1|=&j6onkd-nsGLzs>0K4iDcj*T+75%{_aTKK=|! 
zHzx}PaQ+cBo10Rh#IQB`-G#Kca9~XLO7=qoRXAyfT;8%jqH2RXXZ`SLujrS6SOu0c z8WCOS@XNdCx>}P&goJzZ6gzWqRpLK*h|-D5*2u?~yXZM-(vFXhe{Q9B9>35w2sy}& zSZH1&s@Cy>#^zd>Oe;FP9AB^0DL2kRL@7#v7Ia$2LBWJ}XU$zJJL?!*y-G{#yr#o( zp+r8;Q9|V9@;m$uIZ7^;CYNuO3inGHvVj)$yo-vVYiw}OO$Nv2oek+XW6clsV}kwY)AEpQ_$ojD*MA+Uk4vkOJMD`4Y%DydAr{nqtU zS%XZussY#1pTi@ICAMaqlyAo4Un=*fNC;U8Q50qe@1!Ei+G~X;^#(MJ7&B_HUHC{6 zD<0tsh3+%M^5?T}E4DFgx}Et}a?0KJYB&ZSXiTgSBbAmb-)E!QPS=v82sp^RdivOD zc}cX=LF)N)l4IWnY9s3N2eOe=(vp((rV9#E1MSaGe|M;}SU=@wP)&NoZPfE}{2=#u zP_3Q@IPx@bZ9ruj~?&QSQJ!L+#1()Puodm&x*b!yeBL` zO>G6?b8cWPnTip-w!bO|-7SfQhI~H>-EJe_-_=0e+nBh?!vP@w=ih{)UQiKhGkk{v zRBf3Lz)iY-i9ZuWVd(mGF#rQu!v2?9upZ+!}j)Cm^f=bwq~2!dTfZ% z3W$lkiusVm{JCx#VBG+%daSqv$78C6^fW(<^c1>rudn(usKeUYS@Un1`%+Ia1URl< zW+#j+V|0Ghv`Eh#_j5`pel}1b(NP{}Q)=n%FX~{515f@fTS37$teN$>T)+Hcsy(0$ z{ij~T#0eBJq5Obxb$b&gD_bDvl+S>JFuxa?aA?{CsieR>0nM5upQx$8`Q~V#SQ;80Z>B3n2 zBA}tXoEu0wn0$PE%QTH3)|ddHcllXEwP1d3PbVoFKz@0E?emC{ z7!s-N6ugSj-aMG&$CwG8?6JAznRl;UNEF`>GWOa^a-O$lzw`Z`=gErNBWK{gICHN@?H;!!Id_T`z~^CadL z0aix*ERb&{2JUWH+DKQJ)_f@`<~74uVbc#gb7%9 zJW_w){v#WBs(G6YwEy2xsE1Ua${4E-CjJ***+&4~;`m>53t)ejt8*73Iu>T;r&d=Ri+Upj20rzT6qZykXv%N~AYnO798?iTm6ud{7x zir#}SfuJKG>cV`45J`nB$Wo>&yWdF{&p@bI3*5bHjtdotB62>U>I@cc{7tOCt$F@E zWewXd{`ZuXl=kf2#k4jTwjMi`W6V|=g^NqLpwlYxTW=4raxc!(VNGf~Q=hn)sWo1p z>g9n=ikU>~{3#ui2mN7hP+?mMs#p+FuiL-Z07sI zU^ES=bWAP&^K4RD0{!OSf}*fn4p3!yz>oSrRhi-@KQt^1M1t0<(LU5_4a4vdjdm0H zYba=q{LtgL7G*3_jxB-yAdU>3X{8#JSAgr}pwQqE=U-+w(Xt1Rj*Wc)D=WLP&WW+% zeRqvPweZIye>HYk;JVqk(j;_P+xG$|!uyC^fg|Icx!=Tbif>a_W&+o%E@IkZxl047 zeETK~SXrn_8wcH;hYx70QK*z|6|C{F`JX0s{5l5^c~&{IBdAaL<2s8#F(LRen>z z%3L=8T2+-*RMd-T`tG=Z0Y-6KPUu4W$0ctmL+Ks3-}(<)EO_#t{H&H#NCc6;bm0BC zoMBTgC}JL=+}Q4`dF;jnqm&5R35U46g!`S>+=1Ja0ib6~`Oq*EEkwu{pH;4{X9G%& zn%cS%$)G2J%`igxjFp{Bsgb)BV6hbl;Kj<*%mwEv$ zk>!EO*9eHIkK&56R9KRWf=0s!aPw}?8x1Zw!mC7FxJ3kYTRv^@Ze&YFq&3PHEwsFe zu1U~F$O6!sA(_@OV7u=wkG*F65H1``U0h9(sY}v)6ZeKAV!1I1g{9Ub0gBMRO%MKC zDV*jp%%H(*ytVkyxS_94MdMn9C&i|-KZJ;Jz6DSXHLmXcu10IM; zVFl#wCYk?OR1H1aGROG$O7_2EMinH7y9G4l+!lH$Mc#s&Y{1vWT0yB5B@{4aY67Ix z3pEsl3W-VucQw3SFwU3edDoL{GgI6U@e%V`7~XwkgEq=~iW&o~bLF>Ngt-9FZl7s* zvnMa{d-sW&t}H^Ed*sB$uQHPe-Dv1szg7Yv28G#*Tu!lH%2=I{kdTNORtBz+HiC|- zy4}AjcvnAed(Qq-vjDJ=6~;k){QbiaU&oVe18|3c7!0hx%_U;!-+eLnd(ltm*4Ou7 ziTxH}oBj=8A4O+~mua!uC(Qp_5us=~h%ke|~NQ-efPxh|>cJdB=K- z9vG+oR_qJ=>$vX!=i~Y>A~47aBxqncKp+o!`bJSc=vb@XPg#Xd9wQI&RWtju?)aww z6XLh2CYm_#m@9c6gGzpZ%;H)?P%+5*Qb-&cU(#G2*KYz#v|46sRMEo7!;E=cfVXAW zD;KR~oNu*8(@zgH@yhPgcyc7o`ENj*0|7MrSAvHJ9lpN00MYeeS=spy7KmsPXP-bR ztJ*UOw4u!dFWLRc1)cz(f6x{agBga-HRwpI3>rwPrKSr>&_MIZNyOhZ zjut>FA}VHJuxe??<9`ZLc)Hx|kHaz6i?7I>*O*UZ(Ni>Y{Qg!tiwO;mC^cm{D;(f^!z ze7yrV@;)Nb^!pe`DF^6-&%PwiZOYC44{!Q^XkhvWr6Ii50)3~6AT7)UnZLaGzjcLy z@YF$sce&yzkysw>wrH80rcbd3#1F%*X&i+IrDAH zdgV3yDW$WWkaoFI2!L6GlRguoCk>eXdO>7gh9CLJ#r1X0eMauVhxT$+%*@hU>|M4G zJw|b%?Rf7d57+SReC`)uPKlRH#f~aed%G}w~r3`Bj?(h-u(;;4drPa zxinbG00xGa33w@fQ|h`%*X1I5P7@yBX0!yE&*LwLHqR%sMmHQcxNh$Q`P&pgbUj8% zpJu~0in9+^gI&{*UX^1#X{ZVP>ilyDM&s}4enegRk8<$d|wl*myxX6YX9I508g*|>O zpnN+4tlj_iO8>RgDJa7k&}xX?gRZ6`bB~ZHg+`EEfPxk&qtNDW;6j^v zr(56I2^U_g6?1dN*dp$Pe0RPEp;!LkpRMB?VOcU^HgX7y6@*RIOIm94cGuQ6O>tR} zjQWgs;(Jz018i;lYEqpw6FwL^=0s#;62{=&Ds!Ij=viGH@{Zk0N%d+iZmzkww$Qo= zRvFz4aTM(MRkLO2RtUEm)Dt^Kb-f;(;*Zq-_mc4sdJ=RdsltD3up@sQ*Wkd#3q>hT zF{LrB^1p?NGZbXV!*N@>KI^89|Ap6^9R)?fpodlb<=T^UUA zt^O->k@@*RZ+m{3wEK2*cu&uOL`<%WfR6Cx6a3$u#ct3~y0NdJu>)EKbtNZD22@8% z)I9gqk=viEBS6U_3_JA>n4+ou-dug?#K+#}yFc^UB``pQr(&JGfetOsxCobkA`(cT zw7r9mSU>epM{Z8cM>t!ffco4ANXv()sI=N~j0Alcwx_2;GJV#h*QIB_Pza*E8>9}O 
zj;#MgiQS*cW%4(a>9g`I-qXmY9xV_RHEJ*D2#e2M(fry!IWj5*v%FT+;^0yq2CM0E z0&KkB(Lfo$pT}o|OC8wm_@H*d58SvA2l`seANtx~(-VJpb$8k*fz`c8gia53_J@gu zskOg3ufo4E1lu*p|ALnOxgZaOd}m>j;{7-aP0Rby%3;*-7HCOk!|9@hKtO1{-ELh1 z;OiSyHKv$a+NTQpv~u{Uq9`Qh546)Vp!fK}Eu{p^78X2Xm6oQj?MoZ3>48T-ee?74 zA1kI5tz--EWgymFzk$_6C*G2R3W+7bZ0)bdz}Ple5Z?$UlpBYNfCxYgLSYuVS{ZOy zBqf#cwn~^etxUXeOjSR>m5Zg3uR6Wd2w%U5J^4 zQ9@d$P8}Fe0F?#`rHR1*q)>u7A@BiN`(Ua7oq+@EeY0&hSqWpK?rXTLn|A&9-gvqC zdu3()P$?1zY|p=O8LOuIaldawO>Aedp}J9vozeP#+Y1 zh;(x}{1OK|RPwqm3?879{&nGbuN;?`v&WWctQIu7?Jf1ivpn*<6*=9TkKEd_z5eCa z+M&qNYyr>6_NJf?2zkA7(E!^SIvc+4lUncXR2mQ&R$(r2sR`N8FtLIa2@dX+8zz0D z)Y~rd3{8#unHgj_uyTEupswEu9aK=Kf`ul~+}c zhzD%y(-W4@uDjUU<#$9A%}qa-sFOcl3vEiPW0Iq1meoInuhP%842wpeTF%8au1;%` z&xX|c2|V~UOXV2nODp%@nQg4mx}%V%=|c1r2ok8?YdzErQP?&jrqf)1mp0k9e1#6JTui@(-9nP3#oQe8)xd5JrS z*(H3g54v?*eSx^~U-bGw*Z-ZkzW2=nL1Vmb=FUpe*tdb(GH&WNCJBpdAl z3+TFpa%Zmn7AAEvl1GnvnTjU=Bw5jxK{CXwK2o`-7&2 z_cdC$a@ys!yCP1tqce%4ViKrs(ri`=;OjkSD}xWkd{eybOw0dJx z|Lg=bLn|5^uT1%mI9ji<=Y6l(`uh7lnJ{d)`jubGa2C&P5-9RSaF??&JRkBb02YK4z53hMsPDx_jpD}rua%fsa5 z(Ge@4dv1g!+>>g84uGi#QrOLY2j(NkrMJ(&jQf1$h2m-w|HmxsC;ywh&~( zOSoAz%BitEC(R2|N~F9Yq2Qp~cY#~ZnUzLW8#xID)qIIK*vn$0!<&@=EAj@dQ|I7J zG;n10`Hp?yn)El%315*Vr1cEpu6yj|3l1fFbYld+hrqT6!XCXX|FDT`E=~Y zpnj|jk=tuNBiAm~^`CiHuexg101_(BSQ!9*cLQA2q{r4clQq6-xJmOt7YSq3K>eoE zLGNUhIYrkeJCwc)rb%vMpk1Q6aj5?dSxEnq8D%_|rvtA&!yaFP=C&{p{PI=7aY{Y0 zQbuxYB8Lv4gVdKPj2;sWTErItx)uK+GXUhRd;paF&u+WJY{NU?L}gW1W8ZsVyBoK` zh|$N!ZzLTZwY9MZFXIvut)vP;9V8N;8Gm&25%})Ua>0)NJ-FlQY~gKOVWaDAb{wN( zXTq5;Lshb0yK}4-RDq8u(=%*ijDD7wg>D!pSG03RbH{x@)1;b0D{E;&IW1mEbu~|i zE>%OslP;)&?wkH-Si6(=I%@mZp-$8utmazy50S?6}Mw*&t2})lpC@@ z&kQ!*=g^RS0Z^dpApK{V5s!dixhI;&I^Jp76UGGOQ?aszR+q5xGM^8oypxZmPewa? zK9m-$RigcPuUn4k&|PqDiB|~a{z10>@gT|XVTb^DF;IdM0eFX-i%rPEWwNIg6q>Oi zcR*vKbG_xNk6A&DiN(%jD4p}YviCku#SFhF6~7e!p35){CM3NAQ3Ws-oa|o1vy{$* zx7Xk@SHb7EXKOduvzL;2ju?Y|Vk@&fuX^H}ZPh%bczN>aa&w?pags*o>Z|-_z=g+0 zI+~s*tSmj!VLPEGR~yWXjea0|L>sX8zhxFIER|!hcosgEtgl$aq^U8t8om42nYb!7 z?&usZuX-l>wX4$%z-?=v9=+sR8~Q+kep0p0$I}-cL-Gx-4B+rfAab#$5c{8dl7B!S z$HjtdURFrqny*KnMa{jiebuZ#nUDqS9!}{#AfWprXeuloaw0k#D{(?Y}N z%poy4V<=tCaWh3up?C80>JMGR&xpff0$+ky;HpX7>k6jhzVN_m8j8dqA~3otQlK9% zM)ar(up51KGuip_{iwk5j!(txv8}9)lH4h!*=lDP4v$M|X>!i-Y2P%(LZ`qSo}g`0 zbS5ILzf6tE!vYHB)ry2J&vDI}FCV7q+82Zp%Du^1CYl*7Vd6tGTI9cEXkF?p!Hdp* zpqJEkilG|-qYPj_EO0GTo$KYe8h3|iy8*CzW5X0w`#1kxfC;{0%zXRecgpWUs zU{8Y5{2s?3!Lu22%;}K1{kxZNc>>h&2{N$V6mjSQ+i8ytLqogDkiiIbbsYV;&O7FD%Ky#ZVk+Y2AmQ9aQYWvJ zRG_wP&CkyR#G8B<$PWS@rt3?yuo7Yoe0|%??TNCbVSZ7MoP#nF16lP2ZG9j@eAXR#t*;zSy6_T)-*bJ`3cRKiT4xB(@+(#0@|- z3p0I}nT<*yKf*BTr_?4MZ-Z=Rnd~2G{5|6RyJc`s9h!kGDXN;oTd+BSA<8swC816& zV)yp9+`d+gq#S@_9}R9*}s^YrM$tl&4vX;RP_ZghLL z!LrmEDzk`7~QG zDB%gxJAUoqx*R#4`2N%MGq1ozuT@27jh-`?^Do${_TxtvBV7yDhJE;%LetaV*q7<0 zt2y%XEtAA194!H@hRGu#{euzsSl=nYTidJK&F9Hl+TQkVspR}Bz2h+TCygH3pCV1Y z(gb3CzAq2J3J`lvJ)!^@se3m0w|J#XBfbN$Z;}M~_>#j`Q)&$(lX;h8RW9p{d#n^e z6Y1|(nt+~&hK5aBQtVo~dQECabnzP$g87Hf2~vLd%HMBnC~r{lUu6P7<;N5XuqhZQ zZyd;D!N^hbf=7Ytj)4e{qQ3}%LFW+|MYqT|S_6ISMK|{N8n+w|im1itdMjGeh@;yY zL&QLrzG3}>=SaVum5%Pa#mPwe`0``CB1A+_ONv(62{#j(Q^^md+uv48-vxsE=tqBq zhN?yvY;5bn_`NXg&VD+U`BJ0g62O^?YX!vP+6jtlCENohX_35E#|Jv>aSN~I5m2?+x*92(A_lBL z*(!xkq34;eIG}A(U38lL@gqq$Cc*Ns-?b{z8Ya>ovjD*KL_9EPk@`6p0Lc`~u=IEb zE{mpC!G}k$)#_4F2FA8}Sj089=K)CQWC@wv;@p0?|eA zz!I9rybNNTd{IAzA^x^XLjS%>BCwQZLWwdBVZb5WX%jDa_I*b{Xz9U@3q&Q5>a5(3*`Yp)_OI>svUoW!S zp~rMes+x|*yA)x|`)DRwBA%J!qCPN9g_wC&zR~G@;YHJ|Ms1x!8dYnB9ChjUfSERcKe10wX;MIJy*?0LZn;ur`;Zh%6iumxD?@Jh1O=lW)i_+X82%UK#ETDu6MCZue 
ze83kF1l9n9`Hg~JJJ>>hhOv?ZO7tk`8dS&sK;yy$hoVW<{<+oRv2=3Xblsj^qYz!JYA8tHzd&SUvPwuvSu+^v1_y+`!8cu# zz{@REu#KhesG=p{o+pHgrm(^&C@ch1PRvb(;S#YFZ4$}&h9qK#$i#RVY77})AU6rn zQRfXULU6Ci0fqH{qXz$4)pQC2tL05NFECu5f& zHrslc`Fd+vfb-?=cyU!PA$pH^E8x1CJOS7Bzxltwf9!qB7JVN-pPBjNZ2UK161*P& z@*e#A2J=6qsMtDmv_y3%_XsHwiJMo0#f(djyL`++3^S~z$kNi1>ts?D&!x=!D+V(} zAk9bH0%Gi*NB{r-lY)p`HAtQk+62X-i}e;2(U(NNL|_cp3j)U1LW`m{@v(mfV1~Z#fgvwtM4hO@&B7VKiKF6fEV#M z)c7%#s7loM4=}=@@SpcZ7dd9T4?yy+suCW&XP4F^KOu)) zAfDhTyVP29Z2q|N&vl?^1Nuef`WPJc!+*=j{5MJZ-@4Kq`STkdKs0%TC%&q?4-WwD z^zY$;KOIJ(#{GpxBN4nYHaikAu||)xATKBP#8yUVbd-m6q#LO!^_jx^(Ccwm*H1Zl zMJQ|-uGYmBoyR|qqC{V2ZHW_q+uzNtN<+d#{6<9wYmiDD#cWQ;W**iHA0HL9AuI-H zcOw&o`z{Wgy7vQ6?n~=`jgkJ3r_H|&fcZZfdF_8r*@Qx4=&`U0Kjml}P%yB5bYjyDef5m{ia7sP=VL z#Ik|md8}jtOfff1Y&1A1!NJe3&y^p7p3i6=R6GCAJ{$O6{)>q0Zy$^eXq2wmM%5QV z4=nqVKtPC!j&HCJvMM0bk%~f<%9Q4mg;wDm(Wt@2(EiKc#^{a!EtO`eP zUl^}keb?Wk#Mgj!?de)!!lC6SGC$-HKR}C!!UMGA79h|6Uz{8F=i_gY+V*+S&Eq=% zbqVpul$xKpXD)78lGPX1K%^IYyuyH`f6Xd^f)l>(60$JHsxEJC#0ScSs1LC(swHf~ z1)f7apD8fBuBoFIlIP*2uqK;VFKj5lPS^>ky#M>1@Sgj8OnpE1B6SCkGXMMX<349~}pQSp58G6T@- zqv6k>;NUZEp9N87T#%dpJBs$FTI{zk5*D5@+Q%%HN(F%tPhToPy|+6tJMwyfDV4l8 zEYD2$g>%jfF{hD|u@Dq^(Bgp! zpWN~K-h@}vc0%ZtM@ix|h5bVvpu=FBcanY#eL3 zo_Ec)Q86+bGnn#uJff>oI-^fa0!A44_f@O+x*G@3u}qjsi+XC7GlA!y0GKvnqrz6wlwH}=JsdH?g= zHgL+%5`dxV%9pc^vm17MoCjoV{)usEVHQkp7y@^0aWmfrcRBNRJbmGeld`@2ZCvfm z=0|7F7jsYMLrhPtM|C5fY$Obr8j;yQFWqo|>cP&=ZcM>v-fiq#Ghf0=0FXHiXh=vo z_>|u<{-)6UH)95V>44cA-8Um}M|ahb__T=I+b=`BSkPB)JF3^7#JKA3RfXX+U$w#y z-+iUnwkf-z;9sC}lEvLE} zvDJGV8m)nqCair&7Qrhgh~Pa11Wk{X*xd zb~sM~$Kq)H_UG3J^v2$n7`_+N0uM8ABVjVaVj-N|IY_uEdSz$694(XohW3nA(}TG` zoOKK95fdI{992P*7rV^YYThU&d*Trsr@n#dzr3(nBRfCYPY>8T&Y2B-5Izg$nrZc; z5KwT){QQ{^OqBZyMoT&F&52qJCcQnWC~a>bK&x82@DB5#*YIo?r_rjjVdglU-IJ7* z&N0~!YfAk;x5cdnl2^?PNTAdyxd~+}y-ktigDR|!Z+`s}1(VF?}IDvES z2S5O9WFCK6tjsz0Gjm#chTbaU7`m(T%|I3Y4dSu-A7gLIhaPN_T?~ds*1;F9;BE`d ztBUL#Fe#a>PRxF{RDk=$CyE8xw|ym+7*z zF{mU)EI@n}D+%2BaRFdaEzF7cBKY0NMWL@7)H!sn4W>TTRSQb<5Ikk{TDmGOX5P8; z7H<(d=8GT~W$qUEE8joD)eMZ)$7x+a)&s0qMFiMBRAl&1x%Jvr(b;>d=H|~x6L$SD zxx4etSEusa7xUtSg2(ov{me1|<@ZxyJDm9A_z%)zCdXM zqTuCVQRw2FB~VU1=8lZAj(cBE%WMjg*y&AOx?r8^IM?Cmvm!>YkqWg}!ZZ61Ql znjtk$!!Y~F!ucm&cqfiQ=8g5Y@C031m3@vb>ZO^Nx4xIG0wxDsmYyI#Iz#J&`KYqx zb&1x(71J}A!o7ldH}R=(i01t3pOhy};72Cm^78IZdyNLm*p3W}C|RDC=Vk~pzw`K! zTa~4Psib5~+yBEMj;>^)KU;Q!I_?p9`CR+Ev%$ocL~`1U%U?n>+w;Uw&rM~I%`y{e z!*e*;%u7_R8RAigSE}Nu5L6C1gXwQX}>7AzrQ^*qIAt+mJ34wZEVtKrSXhQp83d>b&bRb)wn~qM!l3Ohz zL~gDr*L~WQ6!sl!E`TX*#VT;Z`_O3=FY&sDZR&9?6!puznL>+%Hn(qY$FmM*W zKl1Z}sBOY$vU@=atOH$CLxH>cUz09iy-qB z_6?j`)!}%=56k}kmoN*+oNtW2otfTsv!UGxUYmk1Ek)n^6b6}`k}5}XAEy|uW+pc< zc5nRcQ=hHkl04EV`QEZrb@f-0YixS-XxgPuNgQ+7BgFKjC?(8&M2)Gn6gl%1(a8px z_OJO+anGD%#fV`PUcP6Z}Wz#4nJ-+&K8);Bag-2|bAIv#TltiW*pasOLA-GtoOY_{?Gf zhp3p=a}t|W-V5KiIW6ngV%s~)si*4{z6lgdr0FGBrZqoSTJPeed=5A38Ztr%Ud#2G z+?Kg;!{W~G9UA+#U0rAka7hW@Jch*WQ9{Na&Yr0mzMp!vZ(yXHXDBbsP{i<{|6o`= zrg{ms+1pz8bJz#Yr|Ae_2rR`Nb_6v%15#x(uc1{LelAbuE-)a4ok5V38P8-? 
zNS_pxntz?#r-4QSj}lDiE?ta-)RIHSx6-jPW!gKk3-CmfhP#5-gCWw85U^KU!h_fI zB?PMANG(AG=_4zi`7tF0J&ZLF5P@i#7D;s~#l;~60naH@37*l7NwN{FnUs|&S1ITg zibV4evRW)HG8XkuH%C=?Mt*K(Pd?c3Rw~%>rz9|2zBbhvrLfxm(b-S|2s3h}%-38( z52tCC>Wg<7Fi=3|`|U=4yjxCqheh(}RSD6};#4s+Lgw`i93A{fq--S7E==Z34tPJF;3`C>o4}W&gn&ik?J@7J(-4si&C1y(h7RQc$x;JOUVEW*J=scMb$P zJ>c3HT45R;!dcVakc-1Z5(8R0Z$@t{hcMe0lo=1u5pADnW@yNct~C4I%H=CX#;p>ZKs!n`nus`V5&_%ImA<)lerkYJQ&77>`M7!LsRqxKONKM_{tAg&7mQ2WT zo0OioD^&Lr?L`gy`P%k2$%)|SyShN$fHNF}31pbUNe4(Dn=$JV`bws%OqF6sV_K}N zhCMuE0q4(F$zmu%=6g*C!<|N0wN!q&O#v9Sor4tZ6(8h|k3+tk6e5?RV#3>(oJ)?^ zi|sz1zH9teA2RBXn`DNEXDWbU_iOvO(G+}@$* z&@OsheoNvHFV|ls-99to!Ga8Kj-%i9NPPrc=Ar95LuZ~bV$<%*FXxZIZLvn@9nmV+ zUyC9_4zXai-oQ{x3`)Wmx`Gk(W-`#jH|8VG-ow1ECS$Q{Ef=}3+xtCQ_B=|R8;5je zloMM_j&*-Tp7TYq`HodJx2!7+qaA(KUH9-2g=jwC@(}bptW0}!OeM64Ao9c$R;}W? z9Zn(|>DAEpW#bdby3YJ=`nI6S+4#F@O@U#-E88gz=k*$qu6hLDiM(~bG%~NWu`0=5 z+lGu@`-19>7iq*s*0gZW=}XutLx zRo*_A4H%oAY4%cO&wbu|I~xc&xlmYY+Kh?eHo#|1LZh*QFB}Tgg00vDi^as{V+h(y zG@45bqu%)KWC6>YSeDh!43QkH%*?bO%M^1Q)_m65lr!Q04Zhf9y4V%>q3lJ&u^jtU z%graIVrEVV)29@0w!igF*mB1mJ98n zUieXuD(cIGzS}cQdD9X5;E;66-oibdGQCm_ND{J|Llr>|x~nOQ19W-Ab4a=OvpdF! zN=45&k>lYEODZk@1C0g85W`PihMEb555~M`>eR%1?-VaCzamKNB1~?W7qs!@oBJ6c zl)}AE`dq+n)k-n^s0}Oaw!FMS;RE3M zZ3x#lI5z6t@krk>$SYCgXckd-EQ@!%anfXq>MJL{m z;td~u;!3!$Lq`<`#$ahi$vos?gFoBoXo2s4>S2RLU1Xfd6NuS!Ff0b8uyoIFGpGyJ zSg?=%Y7B|ngYJqnkG$5e|T!I{6Y23M}r5ZP}Acq zLkv+03l6e{f{1`p0;DR50b&i?{t0pFdV62Cb-*$DF z)`SURHYsR(L3+uwKf%F$LY#kkd;}Ao@3UvDnjh%YdO=!kCW zT2Dp<$=Q+ulsm8HKc75>ljxM%70$v0UQ&WK$Szk50QK1hsI@5>R7QQF;yvmV}~^t~k>{%1x^)HegpN5-0x`6$G% z#J4hS4s-q2%~J}ANCd{C(8hY4uhh&*Wy$JI+)~_?y@&mVvLCROUOp4|UF-Ok2G_7p z`lw(H&1KmtJ$8Lv@Zy5p7g;V2cdZ{xOJyJ!BJIF+*J&(hOGSX#f`@5P<5UK2be9<$ z_Fg)1p^e#E>Lr-LbnxF)vbfFZ z@?$Gvi0mn<5R*neG4M=+Ra>T6+58|L{XzUHlF*gdwmj0@L!_eHS@4r}`Ew_gv9~VU zCzTXqvg_qj#Qm|cs!BLVDBCV-MY}x8f19$@kbV+E>;1_mV05|H&8=%w*FG8oYChoUR@)mGk#J#9JT z^CM-DP@$wimiRs~q4&;Et^2a#>(;J1* zzSYoSl|K7P7BX2<%X;fx5vc&=X!f$b*?sh;3N(W!EtRQVUHHANdbB@2zLZ<*Bn?jG zMu|Gk!RE6Gkil5Q`ejYQnWsXsn6850b-rLz?;>VUplU%PYG(G_jGNTbJr#< zj6@&;>1QYNGrUSv^m^Di2&0LFSWb6D{7PqJPy6bNk=K~tiW&{it^+Zl`&}O<@NGaC z;^e_A2v#H98%bJ5TB)m+bH0@eer|ST3gm10V=`B&YPK8o+n`st4++sf%oGRr60tE< zqN#k4<7DMz)C~xbtVO>t`<7($q*0dUaus|hRK(gIHdThrY4pjPCMCoD6t`ty8hk%! 
z;4L>}@?svb93b)RVJi*0FD$?aUVk?g{QV~14VUy6>ALSXn*|hMjjy|;R5e>mQmKEu zY73>9`enKQJvV(&O^s5#LJprpF@eC?OGX9`3JN^Y-Ph($5ijfDPKa_Y(*%7V&)p@e zS{~LAdOYv;OkR_MKDl{*=`EgA6tRKxlz_b$v(IVm-3zDv4s(#lLAs3@sdicQ)_a3I z<*vIW(Sr3MiDG|^H-`8U6d{`NGU(HHjgrJ{4O!(jJ&?ko@&~~mgya*%8zhXq;ZBIX z`CHu2GWQe>HABKHx@4JO>h#aeBCYCAV1z^X1Oy)^n7^XA;s0=ENDPDedMW^1@v>jb z-FKCDq+{9swW@mgM46jxkK-CE2u6B(Im=CU`fJf15lnGbm3+BzZ*1h;73`kduip2A zB^D}&O2IO3bp%9nvw<(>jKPm;ccjW43Mr?22t^&&Oh6>`G^%cO@fk~d0W zN%Y&hBXQbaM;s@gUtYR;b-!l1u4ql5Z@O}Du#I_^8%exz}{Oh{1d;32-r&w`@8JB8*7Zc z^;)<{Rxxf3y?E=o^(|u0dipwoAwR;n)>n zxIsZg*5TWHk>#oJ=+{46MXrb#sDB}4`&SQ5^ynrNv?K(i+0BggB*6}Iw8;uTlt+8s zo4qP8r$~92cq+x?eoit_BVbWeg*16uc8y*CieIDuC#_O2t^l7*zEXHTk&RF9>}<^j zqiS4hO0iJA>qjYn7@^7M48u`*CF*MY%04Uf%1PSY4#*N!L%j{~lSmP@ljri>n3ln& zLEYh)frASd;fH$(?_3!(o_?=PY`<2of4%zN0w(jBXgOp&A`2$7D*`Q6FKB_^`E`?$ zY7#XS)fdL)onH;+BkPQZ{-;03zlAv&&=Wi)C;3Sesza}cp3jYjf+1sU3b*A^x3RR+ zVk7WHg=M>@rJ|@@cDz6>Zr++qkRWV5GhlBBhW(;sh-OlXh5A!G94bqT7CnMM$kUAQ zL}`|_EEUOdPRMA(=QanxNSP5GgskRPlkOU}sR~pOJjr0*tk%R9+N*_(@eIni{D5cCs%T*>Kfj#>|-Q#+OuU}^=i)tT_O%Hu$i30 z6^0wtpg=fFjeO*fZG^-GH6msTqAmv(kecB9n0%logmV3saxjG1=@kojKHK>f+N%JOJIfU^QO7wtp^%0l(W2Wm7XNzq`RZ4v#MAuBiju5`cnWNCQ zlFWhE3W^#fx#>eukq=3usO9KDP)9PEkhKpM*2 zb3^jId?+z04}FQB;Vlh^P$ih@xi`>=~G zDv14NE7p>vk7Jwf(tymbmgKNKH~P)xVhK}4yy_`wmrDr(Hj?;uY5kZbmk%~h7d3V( zO3VVea*}$$m3!i(HWO?&0l1A>$KC?`V8v zXfWBc2+HNtDjd!wcswQ7PJ6qv_~2MGJWr-pqqid~=2+3ss)tBSp6b(L#hUS!x+voj zYhs*Wi@k1&pkL~t7P7pCz+iDM_A|FU%8h;h5J{3 z^Ceosd7U_1n{^TFAIk@Lb&9yU!9gJ~$7A_fsUGYyYnyGJB);b{!pQ4J=R+@P7gvZX zQjV{)t^1$%UX8WJOuN<+I;t0Lk`QoVaUd)3xtxARi5AK(M}a;hVL-jzKohh^mL)wo z?$#vs_ouS9=C`G#3*a_gG|xjN{zQ4r&qurQWFv@Jwo7RwkWi8x>6zx#*NsOKoP+Ly zCgoFM{t<>R5tLQ}opHuT0>ux*h^tLWgoKp%%4CsWJmJH6h~0va^^K)pae*)m`a&t8 zxLW-?A8N)XYboqN$E(c&^R*wKisg^?2CXfe1l@oUrQ(Dggs@WgJo%Z*3q_n!9u-rf zIhwuji?(g;IrfQ~ea`{s{NNl+8-{`9;t+LGVe$*oB7T|hLIfB(vJdMcQ5>mn0-i+C zzdd`$hV8!OMw$7+icM7^7@ z+J=7IcH#=HRzfIpjcyT3uyTo#lc2VM0-2{t#-(YrJ`rhd9As$@2XdO!2h zBJ*IZOKXT(P2OLUk~*Z?>02)&k@(?TAK~K6Yw2T@GjUcr|D-*D4x2;8n#WtLu~*Q^ zgu+}PY*Jf8uY^P6QJ&1BHijwXNkS7Dn=*$pY4_37~(bdU~P$Z!>hB@81U z(p5hwj|pK=-Qh9`*%`xT;Hb-d<6{e~$q{nTjOxbnlNO>-rrd#;X?chQPs`(zaGfHH|v5)Fi+^>#_cPec9`r z59%Z>4`15b%Zi`BPZNyEe>09R9)s@3wL?d3cmtO|ydDu?*Hl*9W23WaCm zltTMVk$!&R`w|y_&f}kTvo>(wenNX*3q0(-&Q8d~Z=(kj_ha+=6Z$@tOW4fcZPiU?E4hOA(S)1--H=DlPov0|Jk(Y@0MQ6s?4@5}1WNErIe1bNiOlKUhm zA4ikl#2tJ5;0*ICS^zadC>V;8hmn=~qny#ZBS_+{dAVgSifQi8qr?T6_?&GNXAO(3 z(t060e414$)ip`)QdqpOuN{n{Z!=cVL!Qv_c60@+a*lD zMPa6?NR^nGM=sXBk(7a>CqjZ2DY=HFret#VV_3^UthGm>>NcebXHCs z4sZ3Gj-%Z>UyIntN`#VNORh3FCkok+3maSZxg)JO6$8I5gOm(QjYeEX$*ZB`ue1nn zoIbKEDqW+0k51txYce*xR_g6-3QD}SOu~LqE)+1TLYA5lLn)v02u8I-}*Tqf=&A2kvR#eO^#OL;9LOeK}z#wSVN*-4XO#X>6W z9z2iZl=J>WjkI-7;j_1;$Pb@xk3jsF@);mAOn6&_NF-ToYYUzTK~H$SZq<2n3yR;r z4J@QO@}D0eB80=ZNHi_mJf@D5o-L%wD&(K+eH%Q2`WTO3g_)2a$7a2mRJ2l7@)=H0 zj=iI)3oWeVm=&Eo?5{Vk2*te>($5cpp5pb0+*itsQ5cHryP!ohgJr(rpT znj50v-fx^_r)ecN)U0lNG$f^|0=3S=QRy%Bh7~ey89lMYeEc7G@j%{8A&m zdy`8)J!{p%x`#LQaHrzaBr-Gvf>XhVB!oZPR4zo{rcywyj8L!V89Cy}I9y2Kz8BUc z9TZkM8NRLaszs@{RRz+gN=-v!2}Xb%uBpz)1qC6aU<#;^ytMpvWyc`*b8?b#lVH^H zN4b&H>5d)~JF^U#Gv?*{K3o_2{6W|q6y|hD!8Gyv9T7+7I_5C$dXHx0PvWTi&j{^& z(qvkn87V2xd}=!>)8`f7&uN;lrDYVFG+WMjXk=A9f$=Mp!n_nphAQ-4*Q6RQ6j9_y?;ji-V433`F7gb8uO`0k`y^v78QUI8iid|yoV%LHpR-tH zr0Jj>{@i!WVQPA659ZK|45bpI&~fxu6U-FK>Yh!AHN?u2;lY|%*T zmJM@VNXW!9f)c3KGg`(44CvB$o8ctlLgX~T?x3KNP$a1~o;luG?H7ir(CP5bSl@2L zZ^>==g=V*n`XACiwTt0zCuG5A_VxGSqCXSo@LjW5Y@<278@HYve;@eEuJH(g;YRnZ zz$9h?CW|jUVnaLTX7Lx@*}RxxVVp0`a2oB-%w@1j7=Vi<%_q#ciyevjzP?HCb+EBq 
zgJ8S(W4Z0YWeAZVJ!}+8KQ4jJg-Q)6)j7m2uNH4l>lw$}j+8~34ePx)k6uYQ32?{U zvZ!L^T8HLud#hrq=U}V{`$dP#ygx+=XNA;b#)R1_!Z7D|e2 z5xcxI)T;RMP&eZIwmCmr#eZnD-N~lCrm`m50b;w!EuWGKPr*nC->8c~4!yT$A=wW+ z`}U9}EQb{1a14y2B6O89U?<-{4Mp~FF*P=hfS}6YlByJF8)#)cH73*xjCbfQ6M3hO z42RWFAD@W9$)u1^KOmi(YXGG!r{?zew;;sJ|J3k=9#+OIm$aQXZ1ii;qcF{mBdk$M zsV_FogjqpXI4;TXL6+(}7VnCSS^6xt27aOvIeN0G)OBqioWn{nnbnX7aEf5UV8U{Q zp?|fRNZVX_^3c?}Gu?=L@A}H8X7DCL~FUc~mmocI4(S&`J|KQnJ+-5LwRj|#1 zqY_S5RzO+_YB1JYQ_alL>{M&gh#wmYVGAAhX)$3~`Ns$p5vEfYg5KBgX(h8#hOo4F z!j_1}wwiFydq;xQ557ge(&U;bpr^Ir&3YnX9M*uxW0|scQg0ska2HM^nTmr9VoRU- zsk4M!gER0UC?4Dv?tbmj#UL33kC-bv4klA(^Y}e{86jTi*btX3o~vaFa{Y(vYsp?Y zQH<2a{OCAqmKp;Y2}S+ShWMrK;D$w5Vc!OF10;h7fn?aoIb_)COMH6eE`7qK0+oLM$ z!dKQ5uSR6M@4oPO z&#uB-T-v`*$FUuQ1SKW{o;sXQ?VH+Xo1}UTLec>TPlhR0<2VFnM<3;SC2p?5>$!xb z3Cq87zW-*NeRWEIH!{ui_1=M=lz&xwU0z#u)!2k*YZG9D}$w_R?KU9 z$&TVgD(-{~6Hq}JsFyJ5}Bo5(ZTaE@vE=V4!`9fcxpep(UDEHAb!=p>rSQ8v%a+|g8r6_ zjfXQRlCpDytCC>c*`gM`>)OIWT)Irh`072 z6zDYp#q|%K@3L+(lbhmWJH}`ej&mGVEe9v*b@1Ol+?OJQhbDSgQ7-P!VG>5WT zG;v?Mj2swf<6gXY$ngEUMo_>z^RCeO=1!p|kJMlf6}1GG4a%fKTGmCQ~{_pu5JwV-a1XvXKVOF|(gWN_yME}al7`;Iwz z{5cN!=B{?1lJU9{lh1}GuaEej)byih+zY=tY9pF?Mn)d@i$~5@T9T7+>c&+h>yT*riJqI zj&=vjkI?USigaCYf|ZZ&)e&t;Cs`8;&)3&;q?S|JBk4rbYF-%Yfbjb~|0!W^=xC7kisomGHATz&q$1AExzkdt5{~r$By;oS zG5ygCJbmSD%yfzzRX7wU3g6er&(o@v>Jd;zopcN~9Y@G8^ei8+l7)hKOpg@}8I{My z-XjKSW!-XWJfIw+nOnG6T9lFTKT;}5NUMqzT6SAf)LjjJjyS4bW47Deo*GtLHd;JfZb|vyEh~k*gx_fj# z3fd;&JvDu_bSt_pVEbS~w9a&rX|VHv2QOc9HCN!9+YIwT+Q}HUG4*D`-3jsl!4;gr zj?K@}4;-*GHD!`i1MTX(ZsDMYLl^Vm*bFMO;CPT2x=&*1jw^;)nreoX1OXVFgF-b# z?rrj@{Iqpqs0)FaysA~`D|GKACOQ9PrtQ&VVPuK>^~mBkW|a;J%0EIY95R=I+B^I5 z?c(~5^#%JVCv4yZ@{wNG4+t1mIv>7M#f0%U83|>xT+E0EH}vWkps74gWyaBl-rh#z zw*D6OBoa0vpVH}&n2kb8!SzB(42cJQgWdRbz!i9h)e{uLrED&O;O;nydt3&T_qVZ8;j9GGSsd$hm(6H9IhXa(EhU7wunTQTR|XrI`# z@uyFD#<-rPuZ97iIVSs#jUZ?87^p}9quhQZVQGuHMKMc>HuglR1z(ky6eaX?j4yY1 zwElQ?e&8U+aWrZ#8GjO`6ey zZau1=YEY$^jN!f>8yD<5%1D3Xv{(Egvjt^NRL#BVI`M0l{9{H016Nggn6Hxwq0oAB z!c#Ve=9>`Y?vXRmp8C$20krX45G7sWV_Tde>^Pljdwuh>={M2FnFQpJtkwkX?cJLS zZe>Z-D;C4r_YT%cVXa&@L>BWE)5Vo2RVZRF+a(>f_f9IN-!|_m>OwyQZZLd!TSU&A zLaE`!g=}FAUrbjxRpR++lAgDjlQOckp~MfU28kAm7POh)iO=|W1_>%XFleAoXjrs3 zTKqQml;B~@VuH0A!CH$YX35|LMN1IHf>;& z3r7=v*H8-(gvG0aT9r>LdwJN8Dt$zwo;nA+_gyj;I#FxulcPS%2=7sku2Q$dil`j0 znPiU@A-FLeL%$Y%BoRVc=#I0TteyVAP_RHHBUo|8&ogfRzWpZ(37p1;o%TOd#n9rD zQ=8Kkev5UZ)M)lyN?7)!V|uzP6V4B?8|pRg&^yNA<-BK09+;z36yd4>{{$s!zu20^ z`~d71eSC$9I*#LFqe_7O@dS1KUIB7X%jsMRPG;eO2J<))UWyumbR6v*zousw+@dbH?RzGq?$E z9qxK(pu}{$V7Iu#?*+dWvY>2d&&qy=Ti%{WH_qCYa}tzXnF+~ya(+>mio^3W>1C+s zkM=+PL;QLTM|>yU@7?wUmz@8xja66uEsFbW6SnbY$Iud-Nt+Mw4tu^!K9Mib83Y_@<3x!VZkL2+pBg{LE}0x7FJ zym;imIj!EpMzAk%`Qu)^afDs?OvnN6A79_gL+>VT5e6^g{D3%=8+Nn zOi%HA?QL0p{2?DiXcL3UTO7CLM|7#=lUUoGa2t4zBi$tN;tCJ7j*KzqLt%EXorTT( zLU*d`(At3X9;(>kQVoU0?@fpI=47#(m_>L8^r;LgB;T+{@^H3fp9@bGtmKu?P>hVn z`W$rcjV4g@Hc;Bdtqwh$7aNVQ={j_IPEX@2;woy_Y1y+$_4MKLF9rT3kC{Hc@?M)G zuEA&ljF`r6pDTSQa~_Yg^$zyF$;CW*{Bm+U{F#6=R9NA}6pkQcRd+Du9&BxS+!Y-1 z7#dc1I@cQJ|Da}|iZ@ODd^~mkx(u~K&~%vNv>(<|RzRfdFI{YMZ1J?pqQ}?KbM&@@y2J2t{2yvH{BrN#8&?eGI ziN3~{H_c-x%q(X^(;o9O*H1)#MXKucX`BNE8Dnu9-kOtxBD0jPj@hndeCvJF&r(^_ z_0svOac>o6pXC$8ODNagS1D(}V9-s}|7L+})YJU^vW~NtJ>t_@efI-L(iV&JC)2mX zSu+hEy~n%S<4hU8f3-z+wsh5johWvE$jr+kA)oV%duwwDb1lDfDD-OH=851^4}~cz zabpmyjp@xN%d-2%34z|4)-j)pQd0v}W)FSt0g{61c zLfPW?PrFZNsRM?CKBjg;oVqN56irvhzdybDL(k6Lp+f38`+(g(=f07IdX3mkok}`U z!Aa+8O!_5pl_q|1iM(v%poGXhLFnwOO2nHdp9^}wbUw!7dLS>dXaD))YUpGj8YD|M zK1$OB2PwNaV=oyV?(vZ7MQ$`=3QDdkPRntD9B-Dk1u+8`tgd3WeaFje)shh&k7o!8}^k{ 
z30gJS>1GfY^!Tz6h9zC;f5MS^I#lqoL-4Z9^Lg0R^~ctCjp39X7j!(TO$;;HtJjSq zA;8mekRA#4P2o}{+4(Ij&%Y!rM)zjEsRB_mH5m%;e!HiZKJ z5p4>;Za2Sf*In;4^&iozy5ZiV;t+o(qU2!d;OdXrhp$ALda)x@PLKa?Sa?KQ=mvIm z3{ef|jl@h2&{tROj+EcBnr1v)KN3Ib@a0z(()$y)J|FjLU5%vnNHHikXk$SbbL6Yb zkC&e6m$BU+GoLj-B^W$A%jC*(yNb4(n&DFwi}ljiH)hcv9$C7K=y2Ye^K%Q`eM$8= zf+($~yEY>Fa>LTwJ zwd-cb;h`#tfD?uN3jgknpTs{rrQ|i=J)#3h}q>ACU0ae{8^K~ zXAWB(1QN!2-;sr5%b>BLXs9z;55(#VGC4n`yq#2!12?-a=A)&0=O>_j+;l+IhwQIt zyH$aec^?gDh+<)CTos228F|EF#+(_QGOZ?GoZ6d}qim1>A05TF@? z`|&&ePthBX?0y9_Hg^eOJ2RuHI}^B-G|t8&aAI_@MG5T&Vhj4_z$R43gqZZ>;0~ zRfyR&LaYpWN+69Lr{YRE%SkL+;gj~A!DU=|G!DpZ5Je+)d7@Ph_OWf31LPq0yIqf? zRa!j9%s-i3X?wy5fhBx~BQm&C^iBLsMiMr;lWE!L@;um7zHtT1N}pTY)k@(GA^vR~ zd_1{3e>C*J{a2IM=ljN=bckQETtC}SOib(v2*O<2)el=<>V1oikA{KSW3DcKTk@>a z^b*|43+KKU{H3~D^s&b z$`E_1C;o|2J1u7X7uD3yP&ZdAE9-aBTeS7A>`$5NoFZ%WuPvkcRZ=2qKV#!8v>G|)Y$J{n=Fc=XpE%`^lZ1%Qo@HvyhS>&t)2x1(w=^E z<1$)RDEnMF+`J>jV5z=Qc{w9YL9iq(*7l@s=gA?h!MMwQ3hcWyJV^OO_;rcP>+*|D zCwmkPr<((P;R3da0EwN%Up-;H2JPyzQ(r z*t2Cd-`?1fbBu6uc>U{v>#BC#!TP-TPp?>q^t<%Or3Xc;S94O$3`UpNN5XxFSxYaB z-waCn=ySzMLHAlHpiXugqQ?UU#1<~?X17P2bXXO_w&gyMq9X`RJlYrPMf!q?5R^n; z?V?}^o;)h3@Nkp^)H6pOR7m2#TI7g^$krfId+pG53()$evCZ}y@WKcMoR;ypwCD(hwXV_AlZrk5bep8K{YfW=7Ti>o~!tP4{w(NYUI8c$^hG3M{!3o`d_5<@0wV|%ZUcUWTKbFCAUWy+px4N4DQ7Y|) zt>k5vu?&t>yBewt%8RRfvi%Kkyr0vtYeXt(Q7+Z==;~tC%+|p)S6TdLQ`0y1n@$HX zp=vZ}^rPhT?dQ{)c63qh-KI$@$0s5XmjHilPcL)Pr;BU}S<@|t&%MnuHQ zYR?!$iA9x#xM8I+kE&*E&tBX-NB8e~W>`t~k-V>=Td$53zW?s!gc8+8(xPeB+u57V z0?IlSsQ7UxK71oyNadi$`pV4fX!R)>9JKlMb00TP?-#z$1Z{SI{4YQ`dN9S?Rn|V{ zxchIgR4gLAoQyt%7u8x}qAjG+JY81$=@xVDLBwaE0zpT^gIVuVKQDAL{9&q0|5)|l zNc@${`kV7Ed3am>r;}PtGWcZGw&vqBNo7!$UlsF(%63XSHX4*u8d!UWW{9QT!#p8@ z-_r^Lu_wF!@kx+AvgX}naYp|X6jP3M;c4x!V!)lHl4e+a>V95Ye|)h&d!v*iadr&p z+yW({Qr}xI9Ib{p_&w9^384-uu*%1iF0b}dKu?ATY3V@gmi&`X*3I|5UvSc*(nH1L zz4R?)BK>osI?#sFhVWlqE2oz->>Mq55qL+kK>0cam-8gA^VCdNu}wL)F|aGrA648H zydd6sS!*OAxqn-#b1?d?YBY2-Des{G*Mjb8A0LH7ojgBqk|(e<@S#rR%h4y$m<9s~ z#)d|@zzOv$q@LiIUrc!48|G-J`sS2J4sOVV>or9KcUJdEW>hY~gf|2_H-cl5PKUv9 zKP$hmG^k-4st(C;tfoM@M`l#%&F0UQG z9Bh|6xgl~(PN!rjJcW*M3FjifG3!s`iGPpe4?Rkr(j~nI7^lX`+>+GkD2#-6NB;GxAjaS81}{veu^Esu1Y|@)kl?$Ze;erMD|;)<=p)S+74idB#(( zE1CgMr42NBM8xoDd1z}#Wk|fa`{(MQ8(T{JVquZj{4R`*6=k&Jk~h~OM3#U*+YbZY zr`UKedrLHsol_O9aQP#7PcBW8jjODO+=BvoNckDZ^0Kb;!bcYo4k>B-%7=t->Kyjn zh6XGtxo>@LQA-w}5|G~`O}=nc`e#|YKy=7(vaY5f=bt|!jQV^Bgw33KZ?suUTH>R$$8KyC^|BHc?SjlWwbM6cxcYqe$OKHyySDPGBtJD zoeSr(a|tTgMntL23wx!&uU#S#DZf(~7e>l~4BgN0Ut(oFMdnr;>91%4qR>PYwNXSH~_$K;?=?n68}0z(V^x252zIgE|_JUV5dpG8Cf zuds`K_det%>Ef*x(2b=5T1TrMxQ~dVXMta(TomEeo-?a}#I zedwn|m3!Yl2aAy}o(Ti*D>8xW z`pbSa?I;>oW0g<;>fFplS|}l0X!HZ+uevKYuj9OI;9WC9g;L`%$&!Iv4pr6D(p6D+ z2ZT&bB!`8G?qpNiuza3 z)Jr(PDs*QQF2OY{gWkqHh7ReN`(}>n=&)VtI5ir)?9XD2I3RD(qflTt$2u*~t0amy z-~^qxulESBG%lEgR`WwzBuy_94IyjU7meU9EP}v2>UAC30L(w7LoTKH={Q(fNBX}6 z08msO4G?8xR*M^lbWp9QfpiKN02FS=bw82+6Bh%GnKFa#jmy^lu-^q@{XYc&M6qcG zD-3X>Ixhu3q2hug<)sb8arLf#Rvtjo-))N=S#Gh!wyO$_=C2aMHsnBrxIqN#liS6R zW>5r^V_RWQz|9>LuA! 
[GIT binary patch: base85-encoded binary file data omitted]
zQ9OW?J+FI@%k=z!)?%F<2yb1D?0OaM8ppp6XvnZTiAEeKTOWUA(PAP(EQiHF`rfDU zxkje9A(2NXCnsX zFqq5=2(Ac48C2uKKUP7QdbBsZ9;t0Mwq8Guxh`L?fH09FK8-c^(0W{du~T~c8#|ng zi-dFBOa6&G&tdS#qo;|pPplkzyu1dj%t*}fqGXg_XPKGtJ69zrACV;1D*0|Rn{n^e z&Yye&$9PYbAdmia*-z@+y)3%iD-)X@+*fV*a0FU6NtgFqQj_~1=4h_ zoup9;AyMvjeZJs7S5coVGZAoRlU7;jiRR$|PUNrST;Lg;FDVRhoW z9juB?w?#$U(H8@P4;BKwK!qrx3^1f@7;M45ct0qM>c))hWr4n6HA;<)XM}*vM2b zP)c-0tDkt>!|lj5%FD&qDJXrPiZq<$UG6h@4vy0zwE0uynS)SPErTK2ohP+T8r-p! zNI!p2RFg0k&|Z3UzPgsEFqD8Du<2h^f;kwSI5-!Q382*yWa5_>E~~Thzdn(|k3M47 z$*dO;hZj}4?J`*No8&VxP`4q!JrEmIDQ+@h2y3}UAATx_lbJy^zMN<|v{oD45_lfS zop+Q~?&ysGCs|X4e8kf}XZ+c*x#Ds&73-wmNX?V*G@8`l>nu`!Ce0ZoY!K-@J4=YO%K{f z7XT~!c71jPcKFW==nv{>y|?f90Ru2reFi6ESaV7AKZVZ6XGVYX*L6rj)C?&`Bmi%P z6(+EkRyC1WQYj0Pgq1!J$+sjr;LGl(+YPyhd3vV+)oSU7yUjV~w-^xUpRE z9NU);+k+ppu|+d1^|1tcQuak79nI+k#IL9#NFu!+fVz!c{bB=&1WOO4G3N z3lmNKJn`ZBK$K8 z01`Bm)hdabpaJqRI_mINS7$p;7vi}w3pc;W$T21Ro9l@t(`SUZ*sCWN&=_#Y%F>kM z1La&rHUqu|jlhlu=GTMx+APM2pBmSp9GK`_M?9vWi5Eu3OV823+ImNEwZ?kUYtnbsy38WF`D=`DVY|lR8$IDvQNGE%A;_hOx*{h3VOS+edRF)r5R(WEb8{ z`aZ@X0rKZmC}vU4%4d#p^7MedMSjhaL!iB3%*n&dBTvFG<7%p6j93?W|5rQ=_@+FP zMr`@!0xvDrKBg=cYSDjxljm~0{J7q`JLvGd{z7q*$ZySpEm@t3bt>`Ag)51(QAr+f zcP5=qNZ8SzdV@FTe?MYCItEGN2?ei_X5Jauvm6LhucdG{NQtLk2)J#|6m*t$SZHAf zm_=g1Wo-aRxL9J;kKF!C(0*5mYQU=9L`)YlAf^@-B+29a@FeeZfJOiIXpK`=YJ|H zKVZmA%p^o8okJk+$iTpZnztUs5hWv)2%Lg~`g6U%=fV=t*%FJtc>PH1dNo_y5qZXv zIL!pX!zxloHSCmLM0gB|Al_r3`{nAKOXyf-R9XQ{V|j;PFu*xU05{4%?XoCL8M4U~ zpiTX6#uylur1OsaWWkn=W!Q_Q*(1aHP??(zUUYMEkLL8xh4dEm~h+HHf~dk&gACs7V3LQ-d6D039b)A zwfU?RXpT9U4Jm{NJ@Okk?!`_wRnRaeN1%FE=|3=Y^jpF&M9K}q?W^7~M53BuI>?!H zf{wY9(0_z0s7U^1N27O@Wih$P8SQ9mhO%$)13p|bF$#tg!*w3B&PG*ZUk(8!NnX3< zhyg(?QXSo~rJ`pguqq#&f7wziKWV0fRnO&YO)Ckh$!d{&OHQD0lTA69o01-Jh5qFc zK1_+u#r1ES^9I@kSOoyVHpLS83O|KxPFYESZ0rIXNr%)kz@cY97MT7bK4Wx%=uZOtU zsNnB#`bDB~eHh1DW2oZB=^UGGB}P3bp-iNf{j2v&c}SkN_SRuBK)Nj}*nj9#n$Be? zmmhvny~xFtJB(6f*-HKUVWw32AZzZJGa;VUxv_{ zKyeV1!mcn6Gk)e!g(?49)yrWn|HFqzr~a)v_sB)*dSfY56zJ$t z!jStSP9lHK#R@3;f`FCBu>#?-2l_q^11F~s?~F_s0D8FdOPNC@z-i;H`~}eW{IGRO z@TdEmGy6~RicsX6{K{#%B$zbf?dkC^SRE-MM?+sfz3KXNhTC;b39r|TO>sqUILYU5 z9enR4{<{V+5HCm>GwBKo@~r~q6GzO4q9Pk<(LGR>i>MO;rK9M$E~>AP9|oCLGbWIO z>bAF5)x~pH4DjSLoIxVw)K^Z34p+hZmW$Bma>y3-PHRaP*@7&3z1Y;jY{L+>!zvBv zl~PX*2M$W|-pJwc$aar=+uLE`?*ywZD_^~Jy>CIxPd4u&Hp+SK{ri=s(~3t&6K9q$ zMVW?D<#huBZ2dBeQBIaUPCVfNL{eBBn`T1#EDl2mQi2!kN|73t>wjqafCEkSW&Y9( zqoOBN8x@;!)=8re>16TU39+{38WMDPtQyn(Qt5He9BQ$N;9?GhZ9F+Tl)!6(v?XXJ z@cnrkf*R@vE!xPq86QM?oU`5YhkCG-V)p&p$T4rjZ-@{zU99S63CuEmkmWbOE0W-j z%VGXS*mr84;W(uj8ddv4Qqs+H3xkKcC(Hn4v2wtA+|?=Q2iD9fy7zJCZKohrG>OQB z1rztw+VWX{Q9>}5q~0~!gWZ#psoEPh#GH*ne(#q}O_e7p2@R%b9!18I;WHhLNRnua z`Bg4kqu5OKUVTZ1v**KQ7T2Wv{i)9HA>*MiC%ZUbL*3s?lqUUrNu9Q#RhrTYB#R#d z;y2OydYA{WnX_eSt6(B;x_8F&XH4y@5wkF&o6Kt5qc|DuV}o~#FY7;rhmvd!AZ>H|n$->GJhgkQNerSl3E0!@FOY9gx zeS}01bg9ohzlEzc-jsJen>u*%X@`xfFgqH}d-_y#dyDgd+!+rrdlf5)XOG;O2k)k^ zJLCBOsvqRhg(R&cyd9y2oLny48mqh&rl$pb3h)gKi0n7f-pQSBpl?Mi{`k3+FF@X_ zP6`yE0m3DD@ICt`g>1f2MmB~5L+9Nz{Na4c^znn#LBixJTbm6JxHRYLQm4nW=~@C( z<3#teTLeALP9<#}6EvbY>Zzw;RD*mF7Cpw8{>{$IsO`Z|t>>sXJ%)Ig#0n72E$L(r z#a=4mFOr%!-ykp>35hmYsxFrPes$sfEW;sT+S*bJ#rfTBr*IY-(&_=G4Iw^`@rpI{h=kC9}j+uODi!(>0hQeegz=Lq?upoV~OS*#a zdGE7*}q(_H7ZTUJWRgwZb6Q6@yQdWC1?R z`}3uN!C(t8V?I;BoNw@zI^Dk-M68?t{rxtn+twHg;T=YV5?lKuP)(X5s4*-CiGPJR zRDR~;I~qEbAc2a=VG*=r1`(`ZV6bfTau-Aw@jN0LH=+-_64%Dc!lvunfj{NpM55AtEU1H@E z^J+jomT}IX&Ti4)@teTCow;o$0t~4C4|6Fm%x zV%8W8)djA-e>)yr7(KWo>OIBzm0#=5Mu!S0X{8)1isz~yCyYO! 
zia}w__ZQ-fVm&Z6Hl(|#7rKiBtlZxZ;FFIp!bs^Ddhs3%x=KH0h()T+Sg-iFw%bw> zXPe551hJiL3JhPZT<(6FPKbF_qvaH+ppp^m{)B75@@PhKfoOfXnb?%*?OqyHuh-1a z7d)0fqVyPM59um$1C1BjZ~R#^n#Uz=LRM3whymf>wt_=f#sbL$){8mX-ITj%D~e_i z{&Qk8c9zCh3Y4N{3{g9G!bAqTl@y)7goz1e$Nw3eKVZP^#3VJj1oXl3pnqj+<_F@ z&(z2p8qT^3OKSThyIH|GMTMuGR%@N6l*)^y zBK+*@#6;jzbEkEvbw^Zg8`!7Oe2C?}S`yHa41;fN4@TmV3{{2irTos8(fmN!p^pBmU+HLXQ6D0e~BgK75J7L<-*1QMh~AD+pUxgZ!@*Mbr(PekqN-XZMhvd|J&Lf2b`c< z;L9F%uQ+vpC&S;)%ux0RiuL}iTG;JCO4K{*&-i4aZx_|or4(XCq z5nT;i*DCqvAAmN&BgBtIG4kh30aKukmIq};ktp8&<@3vc^pN78)sLP07HEhSb^2tw zYJW>B8KMMfxUCJ>`y)BOf<1XI5|p~6o>pa=#%6MxKG>%l^5GM2;+KiDs=eC1O_R@? zUlBcbZQ4||Ec0GB6OgWNpZ~09h&ffNs4g(!D?9&CzY$eufv33meUj!G=WLfhngkLe zvc9vdCl__uRK$8ME(1lFc=0`;JQ}yUV7Qa2fI|kzr-~q?D{C~-y8#R&kNwwXFbcSn z6g|WXKvqD@&2QU!kKOc;urRPqD{W8!4KsB9RSf_>yzBh3(cvA!JWob%4xs(xk0a zDAumDeN`Ct%!iFVatP-wCtMa-%6u1mlZGw{Bn$N*Kz2Do;?;k;w4O(+ZB`CK%20Cb zGRzRTbSaU;Hc%fYgtXs$?J%qNIVN!vi#~FkRO8_2O`5cI^r1>LUr~?Y?6%mUA2C3b z%eJm_h!#*0b5nk~MVdTxLhJ-Vht@Zw+vvs+_Qm?NB#C9;8YdS|qnhr?qzU6`Z;YGC zk3(!;B5t2Puppfe{QleIS++ld_SOkE53?43vtmKHH@Cy`qc(9c{+~677=ET`7uvoG zfA6yNP1WhUa5%=9{ZY^5?W+Xu?L|&L^ip-|H6B&mpJr6?yP7&0XysdM{o)kd(}~q8 z>+HVgsKnv8?iuqpPin5$7T7&Y3w9d$As>nTaw}S(Y3Tv+?-%wf6&AaPA2B!Db2EzJ zDRwL5j!!-4Tw8hhn|kJG8DNJymyWB$(E~2~6MWNkyOWg^kZ-!yvYw1J?mU${`|~X? z+-TZ}$Z*Ym1g@R{!z{>9m1x9X_NfD5Bve5n&h7?u`WtsaamMVKVpZKKKX3%hXx>U75CIt`#6QkjC)05m-GlxB!bRUX zHzz2`mHcfzsjZ6+=e963QB%*k!Z$2+a8EVB%1j-N`Nfmcmj(|?z53{sPUyru02&SE{z)iRUk*aM@NY+cyNDfza0)$hvdyR2RG|YERZDjiH6vlh&LI zcWG3g|A-*&N>#rx$b} z;{Q2A$na$rJluDQD#pX?Q(7iT=9gzcBuit?@CX|7L~>%nEIib9d@4Ng!-vHz=lH&n zdTRPm@5-$=0v5+2t=uXi$(Gz44Wx3Ebqb$(6(7sz&er?OtB}uPFc#9Z(Z(kP_to;H z?>px3-W<&L?wCJ-7(Ml?Gr;Ib02l2%0pQH2!0ol%*wDuzv~e&}N5u0ix-WdG5d4&G z@qV`RnhEWT!|jL47p-gg$Y`M$GXsuSxE{x>&lD+u;wN_$qd>||#Cmjof`ff@aaL8L zKiAQt4FoQekZvzQPN>_uJT1Bk)p73&&HfV7Z(ERImJTwz^zgNroiX-b5!;DWyg#M% z6cUkeFD+w}>N(}JB9Pv*Lkb`TAF-DOcNz%IG8$iCUHw;{A1EyUdlr?c61*Y=Zy2;4cf z9)fKu8Ppj#`vx`scndG)E+O2YErxt+o)P#tJ0`>64Pxmj@5>mXqZCda);Wy=iGb@$ zjiy#o1`!=KQ-cK8O;_D0!0ZY~)Ee~esTt)zGR4c>1ae3^e2!zc0Cad{>Wd%ZyCO+7 zbJI0{AeWJ$yBx?<(6HX+K;HGL|25I4X9mKQk>L$v`He`07RsRp=%UWgGdR6(CWT}a z{`5|ns+f}Q*mgo2Fa*mW#OfQdt2q@ot{l!46A%3)YH?R*C?f|onx3>%LcKfrgUJ*~ zv0>ccchbeD$BM)hLI@`-op~_AS;}_O#$Bn5Z@D`Ya~V2^;z>*c!l9Bf1$U`tn~IZP zzJAV#ioAzy3giFsoC@U(m^$nfg%tAM3Z&WTRH>32gu$*xU;it847oCm2==LgBOoj% zuH*%=^*du_h_Zqyw*T6O|HayXMEeV?%;KqoSA#}%h;UV5geOuSr;9ZsXMR7=ObMK5 zUw;EgFabPgum7zr;c8F!F+6DRosuy{PJxhYZQeG24&4%N)8zcirhQNG30J$|Kx>=} zVopfJK-=l(+Rwyln+5x~)C>L>*`wViG>bV&6?ShwaOmD^vn5hstuf1Zm?H`-|BX(v z9|!hfv&7KePt5s-X1@J~@>v=8qI+osWlZlyK<|B$ePt%pRsndbcTF?S#5@k~x)=ue zOD*JGdE&x}?2@ez3{`QumR@2f!;;VprCIk#*!Rs$p_oDgX)cCKVITHEZFfFY-RpxQjpU^jxl-VZDbQq``QsOg*OJrV~`_iKTq# z*nE=g`)f&#`@xAbfv`I0;$eyId9!^)Ovk2`1-%Z6L%>4zemGC z)k>TWsdvmFOl@pR5g$Xgh2Q4Il9xl((q6|mJtNBz78aEKsT;rGc+D_nX6Tk+^GNo% zgP1CJ`9>D$39=)oP=d8N4BQ)c3yK`p(#>RuN6s)Ld9{gjnP|^fTZqOU!t0q#!D!=8vShp7C zHyQ*;Vcq;UU6jiobRlgRp}}O*i+Tu2WIsM@ocdlf{)(rlK7)8)pG;cL)YvDyNu%$R z?^7)D9!flyYahR1(`cfVFpg4U6OjYU(qji7g}lLF_!6{_wY_>_O?I06+F6C zWgDPPh@+RpLaL&d^p@*!#qMm53JFvcx;X08K{=THXw*T&(0BAb(sJ&2z%rBaa%{yb z-lz4q^_(r5+WV5{Y|wt6XkCPFYIWd)sf8>l)W+%FE=4VsDVmyi>82dL3J&4fmk8AnAxA*3hW zN&E7R9f_2GgN6ZptyG~+;V1Ps*~G567TvE>7NIptxN8O0P2TmlJGeRh5spk3AzfL4 zh=s)_kKry+Jid&2xu2@YEpi!%N^Gc&jFBFf`eH>AqVLe&|IOigBJ5SduD&t`S98Li z*>e&raWy=CftzEk+87&^Q0TI$R6{0lkcdyXR@9)ilUa|;!=oZL^yI1t{#%P3)pJfQ z7(EN0p$ab{sYz2qFVKUabYU~_i57Q4wo*$ZuZ3yfXW!}y{S!MSlN#g*`EB5`qBBAm zQ&it|{k=X3TqGbwM~Bgm{w3QqVmZk)n6d5QTE#^gBxLBRW9p z>M5)zb!6sSe!Ywbn-JtnsjudW>mCsi5=ztty?#yj^QzCB 
zO3$xsGY;~m!7X%9J~d2Sd<-|o-{{y2ImRrK+dM)GPGV@34B;cftLm*LRfLngjs=gu ztJ8CV4QeB}!w$7!as&V3enfM7-SV2q5BcN zq?S$`-vCt@<>IU0+WTVUV(KGVqWp z@SEoUq3kQ8s%*P%*_%d??(US35~&SH*G4*|JCu;_MnbwFTwhQF23meWv=jooat+gZmd8L5s1lkd8dzB1DAjy9JW@8J6jfad4h;8(ore zKGg`Y`^niQgzZX?y1_UEe7I09dA}xORA@J*pz@<8@GNM?@_y%EBO5W#^vzMB-YCRE z4>^D47}SB~Ri>-Vx96%t(abFIg?9w9vWl^#X4>Zq-zx^|lYi`y<(wQ!-%kfy;PE!> z`T!$*&bP-n3ed)DJv^FB&AlQ^mnz?_JPLLb4%E}xOXS%&l?>0-qynM6EuAp_F1h?IanQ~Uz{9QUNO^5EpIM(a2f0=~-cd+K% zP?uziX1|hVT)h?d@o#yuxreig;x=u^v8|yh34!YSgKjq$A9*7^pop;Ldd0KamiQaf zD}`7H*8Pz=wi^iwa37~2m(-J9YTyKYFfLZug<-%Xa~*Vzxlw~jKpGRqqD)VhM?Cg* zMISdXkT>!Jw@``ocNU0Ocw8|`6d{R-wz?hS9k7+7}mTiI$0U%f3Sd*-8_7!7RJpWVZZpY9}73X2NhX3!8})p6;7w#~uG{wO%@URcs|y|Mk> zQ0vYa4XPye<~A=jPVh&94^o*+f#K1V=zLZwbo3j7u-DO>Egc8zk_Y2c zG?4YSp1&UC`rq-WAH&P_5~iemK<1v6lspJn?!Su%qWTC|Mr7h5ly#UcsArTC%Tz;s zRp|2#g96jhgNtClQz^h|ML4HJ^5uKCYKc1c8#X2ENii9bCPl1D=)ORrISeO9C^hf% z!_C?CL=l99Ef|dyFLR7+xdKW0s#)NI0D^79pf=l8`|2~bss81WZpT{5p2hEM^!p#i z-2`sh?1??olC4>pFF_@z=aTgN`a=Rp>iH;Zmn9>*DKCxH*&;uP1q>v1UrQ=yItG~G z=TOwEM{S)Qx*x7+fQCr-0RzCmKjuE_!^uI~=q@Ot)Tiek-qst#j~xBgG+_OyIYMB> z?N^maXxrq}Bs~|G!`fi; z6I0$Tw9;{`by)3PP0=LSyiWarv5^pM1_e^_8`$=s{hp9f-t=9{4dM+bx!sxDGbo&R zyU6~@6hEg$9cM8ro2N_WeT?e|_N>ul?qA7Rvz|$ZH--^CW8onl^ysuiPfI5Dhv~Fh z;DjH1CfC{sY~VaZkSkWsmCV%><;+#pPOxUqSPL5UW>RrCsns9KC3$xr z!sehbQQDX}Wst<|xZ~|1Y9lIg{kFZaUXuszm_blzs?!>#X_-yrRzxlwRu#iB1eI7f#?L{bH9>_2a#eYH z1^UrLZjr86zKgJfhGLA5J1@R}&>=bT({uRn10MukTjg-m`_HuU!B&{FdoZfg)z zFgaXYNNjf>ErJ#vY+u4eZY!Y}Wfh-Mwm1UYniXR!=@uHF7O)C7GIe)z&RXeuO2e$& z5aI>np(6nWZgKJ65EtJ|WUS7@0c*F$0`?Ct6AOFYIPZQ)tX&d-Zu2__xn}KONuw7d z&CD*q(c*tW-}KYX$1$|5bZ5)MsOWz|6I&$fXM9_$&_I9zRYT0fnZCshonfu4{x+?O zrQl&`|0Obw-3m%47h=TD&Lul8;QLy)6-E9No*&+-;@0OaO}L_$_9rfSdDCKk&6v{aepVoSdvx_6Ptz>%82+cp(m7w`xP} z%QO+whZKYI81-xWd3mSBZecn~5byiRJ(JfjhUT4e7urlOrRKE=V2`0&MFY%JbZlf- z1wT2XqY%G{C#DTe^S`*rl^Tt4Sq(&YhPw*3R6kJ~)+=C<6XHkdA$}`5>0V7V$1xzx z2&@DX_gH_;N)3!kTVk`x@{0*YLtY;AbNPmxzB6Ni@Z-_OAg9M#KFhSTM6PX-;t>MjUJ}jUr>I>rnFtO8kM%)zC_-*yBF2?bv|pfy zCUhL5kT;zT?iXSdy1{%6QiJc&z57R*Medo^hZh#p#K>$^qsLDNimAd856%{-G6F9Z zIDBJ7%jEK{QK9;$#JtT~^0v|dKv17Y7q}uQ5KDD_`h7`VKJF-{Hw{|1NOKmvjXz_YE z7OA;@ikIh-Z%w~degp~K+~?L zfV<@noc>>?WQA5&%pl8 z@F(D|kb6HfQNm|xjo!~o3jV~&?lL|=3wV3l=vzNl5mYc#t((AOYVrH9d7- z5Dbh?E)8hrl}Oap^RS8);I_LuJF`%m39$i^pgs0zjN5vwmn75Pmn<_eu}&?$*IUF1 z*;2+vQI0}Z;DkVB{-LGWNpMBABa<0^hb52LHQsU&vY@teO^@K5_ z6ol2|zzJFX**cr~!J2=#qo~eN()J-_)05{H^OP|iTE)D;*=z3|whTg$at~I*u!PV1 zwVT~_zHIEA7)AYb?~03$qzjs~`Psu~O^6k}r0Ff!7=rmXhH&E^pxD2Qi$XH3Zg>q`c!%K8}#P)r@B{U-(=Sz@_u03 zJQtH!Ri3&4MPaE%VEP2J&$i-!_Dqz|{wVa>A5GiO>jLlU_dlWQ-44pLOCoRXr!mAP z5=M!X1mn%!eRgZYsP5f%vY^|Y2f-jm{w0&fObmAZ;wptB1_}O`A{vJhSUJ-gp}84+ z4Og=o0-g^shxf-Xto#=Ej_7Lff8G`ck1110Oeje@9~#c<$~fjeGFsoEYgb$6vFWwo z=Q%oN3wY}(44c*EVjnMq6*O~yISmeb^7-Nc$g6P(Ea2Yd^RsIoXFYRZD z&&mvaH~y|cXu+z>h%gW1Aoj>O`X?Da<(fipUZ0>V58WF1yo$`yH!#7A-1-4&8Bewh z$XR%B8b#?Z2fXzr^R4h$V(>i{- zLGwCkNMvCw?<+djtvE_!Pm09-z*2NaKP!7;{Lr?bNGrqHH{5I{Val$x zU+CQp64R!z<5B?k<|2C3O8U^H#YMB|Oe=r=tJWkFAxdCGcJ?5d%btT!TWgu(7uU-= zb;qKedWvx#S_WlABkMGrBbiE*=1D?fbV_*7of$?nC|UwD|2>5cab%L+B*LVTojNgI zamh^B9#)Rp7wdDL%Z`lKyhzqYM00TLRh!@cN{#IOD&%TfU z(UKiBoE}(2-w~J=_+jNeoKBh?nTf2sN>&36)AX$LBJOP?sp%-voTXEtP~6=n{Uhee zyw4p`8dp{(=){XLWh1)3`dWPg}y ziGNb4sEbLL;xIYx%1k6iRQ9TMva3d$G#f2)mn_w8DxgKg5Z7$P{zZKKsgp)lK^uLy zAGH$H$#uYG$Bns2gHLI=1n#f8ThLL*E_#bUsC+??f4LAz!M9xHD5Tr%O%wr49y2s3 z3BH@6dqW%24dMcLbom07TieH7;0)9-0D$>#KZy#3KRbQo2cmR+WZ=XdCvz}(Bx_`QU2oGcyMa_KTRKS79>hD8dw}eBo#3L`7c+-2mc3jI$ z@lM>BLkW}OBxCMx>kkn&P>Cp8-U&Ag8tFYzV$)bw3!8!WT|Ca#PSI$hq&$I~lzDh0 
zJO{o=S02Z`vVipSd3F>o1J@Vg^=yI^ZTEl((rq;E&Ig9lPAfKg-QFs{*LhDpW%16a z#em9&&s0`xi_TvJKJJE>|9-vvPw6A@4+3pUKYazjDsZC8lMPJZKM~lWMF$b6$710I znv~@1oP(z!>u=c{!xbV@5qHa(?T^1%;~dXpzTvr?MW5SpTZWepH`@ES%oYb`;#oej z)a=6kBJHIjq{rWSIaTgQhz>_5rkKHly1(s|!$4lPzx@RApO4NHo=?Rn~ z+vT8sBtffX(1w&A^e(N;4n@ddE~v2$kjvCDpOKu{DQ{CJMLCOodkfgZN)DO+;_{9r zm2Oy*BguK3LwXIqPuo8FXll(!V$J;%ciTK5nXafj-*)?Tx7p5d!Q$~gUJ?HovAlYPb9_Z@h z(x!U$r6Md0Acv)g3?JAHIUMoNYDajl z()ZxGkod7s(ZlIR}mi+G^K1Bo=GDOaSWcgue78EFDRb^ zrgcwCo{ul2lKi0eko86nR#1CH0u%;OM8ni%d{w7-sv|;0QhbDB{xSriwDChZng)IW zzgg#og6z1?j7<$UqVuYO9j+xj9pstafx=nzK~#_`5#(BU!j8NvKEV6&^4bAffO_yh z3Vo?f13^6kOe6@MA_;CLG^|4O8K6o28=&p=095I0m=~svem0pGgzP=Wi$Om&yRtc? zZ@*+W=A@%W=L3!X^5*7+`2_{CYHA5V!NE~N@0It$pRJ%JO`u?>>DuHZlo~CM)uOKX z=TJZ}Phy-Uh*~(f=SAL4H|Vg~PUa5K%QL(L=}+-~sao#NN&7OIZ>S_2(?dVVJ9z$~ z8%-UkDAWhS5^o*I3hZj5E(LF>MZPkC@oJ9B z4wW2g2W~c*J=!tHu*U@NsiJ92M%>!|tWQ&_@49iEP&}l51SXE2~@J`y7+n%VPYHyqzLK*jcmCUe`-I=zC(%aUw$hnSWpRrIyyQ& zw`eCH9@;i~UYCz#;JzZQFOA_HHya$PlV2;N44GIODWfj(uL9I+2E(c?a zF{JZF8d2mG4!Wm%4$fxnkeA9Q5%FQ194~|itmN$l(zn;pPv4VN`#EG*3TPU&Kg+-V zFG=fIy>9zoQkIof{2{!+d2hQnM-8sA+5O`9;F5cdnbgSVn}&1d0MCT4LaxkI0MB`B zPUr-f=&3K1aDpfa7(bF&>u~N*z+7<2#ewz~z^d03w?` z!SRq(-I16;gsP6MzBhmnG>a%ECJ76vLL?9!9-X zvDlhGMrZqPB62P|ooF4;Ot;@u{_9gUj~5u(;EH}(raV<>(xqnMmu@Kx(Egl-`)^uQ zkdIk|+|b64x~`uyZpiCD!E~=#NRTEC|A<{+6g zM_5FmDLCM2*{O7a)9W5&T%BPYAI$0R>pI{h4})5zbrxRH992#7S9FdR#lv+XSM34PTX7ve}oW>&Gh!G#c%5N9} z@cv3D&^rNvi=wB$)aIE=JkvCQ&H4hL`Na8OvL!$eVxK8(%Dix!SVs{86HW@z!q*;# z5;35^&L&K-uc4x%BDtyQ?el&Dy+%jLZ+Ut2%*?Vr_lv7Tcb5nBOiZw3wr$eiStIJw zw4iV);yCt?t1PjCbu%nAWL)myBdPob;tau7rS30{B}gnNkV!~vPWO+G6M$~0K_ENd z_Vg#4{u)@}>C5yejGfirdwCL99~I`$unEI34CW$iQGfwtAS^08)?brC^oZ3gAc zmvNXa*haaYTC`(Yv=A#wxU)*oN_2vMySg5Vayyo+(g!fkG>>}!z51J~<;q`c+3J2C zkx$pt%4YuxPWfT4fJ6YxlEp&Qa~kehH{4in2804$va1YfT|}l&C_tcUsP;d?Q!{Xa z#X^~3bY2x$!ii%y|GN9sS0AxBB>Y`}No-vf7&EyhyXtN;K%oVkF1avsyC!oO=A1Hb^#C z3OuvkVOFg4v+b0$j${gI0;SfB+S=N2v9Z*hG3VQ}3*k3AZsdY43P2ftfhki+&;42; z=2k(n{`)#uOiWCy^dYLjnFA_RF5D=qMVp6$y5g%!VQ#q&d$u4g{amQ(L*!xyHrjqXz)rf)AAKM zhc6esqMUG2Vndai)1jOjfCH4w(0c#K4I7BAh5x4I_@h-F!%xyZdAqBa7CIQ7fWdJR zqwR39cFC_|cJP-$33v?fG_86!N|q@JkhFCVUHTUzdixLj0T=WCN7k?cPviQf)?+Qa zTEN;bLw%W@F?=5nhmR9Ew@L*O_CJW8r6bdWeN@v`N0n}TU(CiDn_r7AJE&DyPklqu z;pNDr<-HZf?*Z9VSb9{yY7GZ=Gllt(Wg}_7ty)hmfB|ER;GW3WPmkI2S6&4kD z->XeXUV;0TrK*ci$#NYpzDu1pDIjrIod%|Ka`L}42&1$q>Rp}JRy6ycf22C z(DwM9Y_(lFjpSf`n@hX?@CGu;t?929iWOn$XT9bjg#zu7aWH-q@{h8*&Bd%2+h|#H zWS4OcHCuJ{4H4IhT}`Rs@JgkX#1WNj6X)a;X^|v zwzri3QmLDf`^x8Xp6!^1#}KH)=HZ|Do@uG`xl|#WYtb1Y8<}+;E9Y3AG@%>N1t#W#xt5k1~uKL5(kx-~nLHr*tY_W1I!@H0& z!te)KZ#I~VwWrlKqUpJ2(V$t;~SMT9f0`4^kBBE+G>hHyT*K!3Oqd+ z&lpSx^S-}g2JJsyug-05Zfxk4!I`!A9m~!wyQoi}LAhFf(1gaXKhA+Xi|;mfMU_fe zKZ@n<5(Y5~a;W2*#bwm1>HCl(zy!|1`2X7Cv(_aF>x-Ikn@tD_>NB;wG4%eYwC# z6KsF*yPMs0Jp04N%sv#48iSI}XP4G;yOVxC5lU%gIdOV=ntO4T(*D`iRi3-$?Rb_T zYxbTD=s#xxWP4ww+Iw*{t25+ccfPh*QTX+6=i$EDnw(>g&mx8r3fo+&MD!X#k4l>o z2v2G2_jv1hUhQ?cr%V*_XzH_|r(J8BYie3J@P;YyvO#9BznDVa&oRdBbh(!FJHYIo za~1y|$nwAfo)+}w5EB!_K{*e>@sU!#%JxutlNKG?T81wta_PEn$VF6SG!noVEHjV~ zom*drQv%~T6vN*Zmu@i};@!U2i(%A0$chK*ef%?n4@`G0uUn@?q!-PAA~~Mfxxsfa z*#Q&!^PylY@d>-1oW^iR;rKtow-?lLa@xdET0{X=QDmRw*>#=%qt4Wna|LBPI5IOD z_}rk4Bk3$+lN-me8MJE63b*U^-Et*t#XDVHXC+V6%&o15-ioEGsm~0>dKYcU&j&?B z3}L<8JbMb?x9?oe!_M zMWlemQ#2<}6uH8LeG!!6?x&7-=vxl#Hwu~0($9Aw;0o#Lea;MbJB(8PP1{eW-&H6l z6HK6OZWm$WkNCLPx#j<_VxC{GWwlcR-0jxZ-T^nb0~gYGfaA=Yr5Rz+=b=# z6^uuja3Q%d1&kWYhrwi?m+p0AicW75Kd~cZy&u23`bm8XHB@qfy_x>yt|?c(J$$6g zj|DE$y*`tE`iPDYU5|HlLCS&QYEA!lL%TQ#M-ACtS-bkZzR!Wgm&`ycH_o5-EPqFs zAnu%dU=?7FHfl)XBDAiF8YnS6cTy*~$YC(A}Uu 
zs8e(H3&nr8vlzMRuIa7dwu^SvQj@mdIMwwZ!*ze+iuzaLf5Hf04wV|5k4qW2RM%U^ zG)Ej0RZgZCq$l#$J94<-Pqnx?LT`%q2~nIAJBooe!?J>jsY1>pkVfyJEg z@B5@BwvqT@3*~{sAXE1f-vaRT`&6KQBYIXv`bhgw`%ohU2FRB1tN_3uZA@H`KBqGG zC4#d5O#>yy2PYd+7_b}oJjEv@srd;}B6b~swQH#5yJ#3?e~@@n+Odl+)k$AJ6Y~b2 z|C@}X%ln0YNd)Rkl=a%`Ruv?k&WE}swd13$k1Ar2yykl2 z7I)KaG?IP(cqkpHMA9rMle9UEm2+M%aGEWh5S0}f{y4X?Fabw5V$(iUL7_^iJ1%8qVOAu zE>*!W*`k*xpwz8~IQpi}mBKT~U;E&Putm302IgT!u!5&iMl7-LzXG|#Y?VG+rlZfq@Yr~;q%v<6J_%kXYAgSD~B?pPxy`>Cka>Pdx;KNuA-9M@ov z9mb+vZFImI*XYqo4Y?f8R^170NSb}MpZI2}$sI#Z#5b;{A5(6x?z=UL(OBG6(!e*-V^aGoIyhYpCIdfpf61xIlmMz&tp(f@!Q=6PHu#dj@Bx3+x{( z#N>GCR*^_(5bAJH9~7({ay|}rNA3GrTfjzr!(_umIe)>3m)p&YVPL1)m_wNPZ0Kf2TvPEgA*XcBbjK>KASZS zhhPs3y+$aLrc=eRldi^KGi5u?li2SU-Pe{Cn|)gIgHOxL%YhEf@>gD-??mo*D}XI? zZMNPfc|E=33EJFw6Xw4KchO5sa;LkK{jW_$A6Of;*>|1Tho3(D+BchAaUs0!Ijc75 zf9YLNRPbUqC%V}=fKyLPjY&X2;QT+fPAs*05{9QvR2vPWXtYPeC0J6$ z?3uwdA3YxXSX|sl3)H%89f%&!-f!wseoUP19mFBVX7B;|KzbYZL_XRH3GUWpB2{L- zq=G0Sf~;Y`@MrHBtp^-cGXy{wdr` zaxvL)$k13x1|h&$`PZ?zmA(~94Sug^VWIRzC|F75c5Zz*{DY9L`b@mos80`2pisZA zG|?FA_J+Y?Dly=2GS~ISsVGXaYER^gUiB$$Grx2B1*^BzlXI3_%Ql>53lr*T* z%vJ0RgMb9utf`c3HsBEu>!@!&cPdSxt7rd*c!$|PtL46Zv`cTaVw;N1t#}+Q&4c>j zG~SnpG3VI=PXeBfhWc3_srwE2fu~;+vOp$|oE~6me&?ZJypkc4Wcy*dUO^Cxvr1g4 zR_nWOh0-N8Bzh^!Q_gkyz(sBhcHCQ17x3K=S4bsbu00!7%nIDK1EtKZ3lyieHX=}R zueiEi%BZSFY<*yRzql*R6xwpSPof>>f=eo>2*wWXov&P)i2j@juvqpz_`K#6AjTU_ z;gTSDC2o5i{Nh<8qV!M=scNG*TeYz%XlT%Sp2V}axA(k>M!U*57|I)t6G^xKlW2sH z^4Q65TZG;^^5c@H(=~v>xPcOhYnw#KO}XDB_71&K8eKNllqaE@dHYfd^PNUQoacFb zoXQ*FTDz&{)mG;0kH~t*{QWUrVl&x0eIAOeJVNi2fJk0op3r3J5K@C{9SHdcTuX z_m;2>ZJ#&B$F*J~!*mm67{gqY!uU(DZ#&_mZ!oMS;L)Z|B6fZ)&Hs8yN^PcZpi4iv zmm1e(iiiST^jr48GLzBa{&nD5e{7iAYWEnu$)ytoQAdUHhcH0O<|~QkmZ;=4NWgdp zd!4lz2N?6?)oU|fkbl^5mS+f56Xwpj$bFcxvjQYP%3Dfrbfd;d1eW_Wu7;EsZ;JQ) zv*Zo4YtlX4K!=7NFl*>f8a`GFtV%$=&Q2Z1$?r&t#y^~{X5G*uHKVLq@o zmjq$V9`C1mE~707uQpj7c2cAw=RfMEkxpc31i)Rcn)$rN%Mk*y?c#Yt;erJa?<<_fNo?Mrt5cA8 z3zR47vBq<>-w>lgx1~TVCjO=Bh%jih%IJrI%H^1jd977C%?n`!m<8vQz%)xgr4eC4s*lf#O3yWh|mNxM2Ga zDqP%Ky_&1)>JAcqjul`vY2?H~3{W`rI7TkwOC6%!vsGtOJ65_{o-QQ458BL$RP{?z zZZ)?NNgc8u8*Lo+RIo*aTgMM)<YF9=AZd z#8>O^a@<*wFvDL5;`g=?0e0en3U#{2&cPbb^L3@X$o}%;EO&VaC*y<9JZk?z{ns`u zw{b26imcoQ6haG(QtHzZo2}E8DHQ)7!>-h)d}&y2gD-G;C|#^X5FEM}7Wcetv&rU4 zSogOovLBgllPd-@8mr>KG{bWA5XF$P5aGEGw4{s7zzVtU`e8GA$Tlm81q}hlFY2Ye zLr)6E6Kv2u%(CSkbL-m;Qj8jPM7-{-FuLbKfYn*@e*u6b84e3j#hm278+qnVF?PGX zMp&7UumDm37BN*PwBdf^s8BWM@Q4W2XmyV=$Glx3S>^5R?UFx0|0x5|!xOKV&S%sS zmI%be2M&4&lLO*ic;nQ}w7{D0)k{B%weeLNh*{YR6EW*7gn$%*Y2=Fl?0hE5QMJJ8 zFQ#we=Dkls^@h!RRqdDOa-ngr7{ty(s@4M;6x4d*zm~%z&987gbl?e-io`ukVSsVE zdt;#l1YEyffY9QzeVqQRY{e+FPkU4hdV56fItG5-`>~3Z(*(GZT)pAhRw`W=FF5eD z7go;vN36I{L|{Dp9GU&%^3tF7#?seb$OVP0CFN7W)svGv%tBzZ%x_1bJ^1w%D`l(F zHRFfBl@m73=%OZcN3n8}0^#bGCH%y7FR^mm+8CmQF8gz@b+RxYIR-bOuR5?0nAmbc zRQ*U~{Lbe*JUCzN>NI8hrVx%tSQ8wT**PN7eUqSqBgwEo z`M!cXsaaGljgNb&c^#aEPSoOuo8R30(;^WLv96R*%ui0Y3EY=KMX$vrtT;8O`$?x& zc8@||3bPa0H%1~7jE4&S3RbJ4;D>YO@!IQodo#As#WzBqrwtPB5IX20`Yg5^q_s? zT)eH77{zqWK|xA8S1y1CLgoCrge=Ehp?-IN+v5HdpuBoW$G{=vS2gd8gOfyr<$jFG zD;E@U2?4Gr6kX8wB^p$eTv{urn^ws$yZ|U2->~5efU9H3(*%X*rIGS|wJD7_9-}A* zf%eZOnIMl=m}(;Z#STF$PS{Ytb4_r9VBg)-9Y&|eW>~RX^IHU0gnbfsHVfC zK8*x^VId+AV=;;vA_m%LL4od`i}CQIK|i&$<0CLJL6|{k?;IXT@6)*@e$xWg(z^y* zJ-JRHaL%hNkf=vKLwQSVHy13Ta5#KCGMe}z#pKI$gZSMa?FCBFpe(?JDD6>*7U}I? 
z%uKuttx3|T>--ra3&vB=nSRPWPRVP{TBhItYlBfaRp8B%8dE^f2W3jO`{QF5MiwD@{G$s-?h&a)j6#Da>z z#B#@(?=QFmL!v3adFa$|K!JLlde*?yUMB~_U9Pj@08-Yq_X3UHEVPk8%B0{uKj3Tp z?jBa#;sv073oj9waB_@I3WtSRAj~3xX-h=QMz&gdFL1LXfJVQ~xkvmQg5f4bDoAIW z@3>^cZxW)%Fa($!R?b&HxJyj~>>M0p(`_W29LS*_7A6>P-g17)xc@ad<~vHEj8gWB z`1XfRYRG89^k=M`S+-baHNU3Sqt^;y=ni6ncGk(P$uG7y+?gQM0ljcwykL5J%FjI@ z^emId3gU5(9Jm=7M3{WA_^$HX%GMj;?e2LaFdaJ$KgMjPiBg#IiY;dXu!lT8FuqEr z5yzw318gzpnVhRO8>TZe3NOd(_mHpJzW65E2=Ow)kpPx{80vjm z$a>83fJk%;v?yg$uM(eL>OIcHBNK^wK>1i8+ki|USblrjAC=ISr zeN)D#=`({a`!|J|M6*eBfpa`2UV89%`yeTa`-4!coNPo-w_7=jv7OK3YQW zIeSLP(4@PSm93vnr+8{&OyoJ2EnR{r1uV?rH(R0poqz9A4vvfvU21H;R2*sX5?(al zB?WLf%veC9DjiPMFlQ?`F^XR97+ z2_gie6q>011~HEKbKOM8b+4?DcGe*d4XZ=`r;ps?mPQ$#_UI%GtySmoE}wWP=mUVg z2Wz{V=exdtRS4LX5(w+d013|645>xI5_;f(Ey9z}6{MlPXV>_7y@c#`JK^x}S%Ziz z$sm>+7kANM+Agrbx(R(yr%I-a7JG=und?fLXR3Q%q`&*kWT1e&on7nJu705FE{OMY z+uiYtYAU;|cR`{1fuvUkKh`!PMS((ouC>|G#q&o^&k222GwTsN-mj%vnIL0M6zGEf zQZTxOsSnXbI*D!pY^{*xDO1$>mz#Y1MZgH89SyCE>}C2GHW{SiNrVs@$Zcm;@R&I>@o=tM6Ul+g4g6+rV#|vp zk5=_TUdj_Ez1^wQ*a+tZ5peGB?&QT&dp{Mu)6f*ohY#>%Xa_dmb49ZK?tSA}kFQ@6 ztgLbozLVaAghAdqBLT$tN*p}xHh;e~`|(sy-OfDf(s_SPC&p9hd@Os?Mcg?!mKHK2 z0|g$7<5lu%022hP2KWcB(*uS@gxzld1IFFmmSabO2LAvcaeo^0&F`L%D$oI-J38vE zmlwW&^>=zaQJ_vh7}A95zf1Ei+6uW=y=I#V0Whx0daJ;$4_Y^L+HDIGe&N`^ z17w%Nyfr!~JcyD7(zx#4+wt}qD+h49@_qqj4Av#mAeKQ?|9-28=7ksi_Y^4fJbb+0 zaW*u4U!y@e+BN{c&WQ#+PNM_@S&R_4g6Do8-b(pjpJ)S~3GMS<;zp-kV4glu+>IIo z=lpo*%*@0T6B&uF-SoNGcD_;!6-p<{mBqx*G6Pl2udP)p*Ap92$?wH-r&Et5>rjl^ zB>H?ihDyNSCYXJ@Oq_6kviX$Dx9j%wWHV08QbvsmMZE|{fJG)An5xXOU@3~nDl5SG zW1`Zf$lAoAclK7=SVU9CykNwXKmLWARx~r-&h~=fRs4KIV;CkSxEjd|>Q=tu*WlN; zClh$vjv4s2{hV9aD(r*ZrJVq#8EFaKvj3TXP)XURC#5JX>fGEnt6PYu`*dN}JR}6s zm{o2VZJ&+yr{RC;wq)Iw@49~_4?QSOi`o^u_z7SAo?OWKJh$ZHi1JBYJsl>ujg|8n zJMJWS&X*OHDz#@Mtt^?=0a^2>V@kw@5r~41hQrJ6E9D>V?r>fS{d*x9VPxNj_tqt> zkbIlu?F0P;XO1?cYP1ATNbSLTV+VTCULTFaCT~Ohy`qY?x$1FVyBB7`7f~AJGaUX; zZMF+nZ&9G>hFkk!hhpSi`|wX=7CP6$FUWCeFjngrZeDQsn>b)!dKiib*`@KTdGsFL zKaH0NsfGWvQB&JULg6hagzDU(i=8)l<1m(ZkIkze~6C zpC4ncWULQ>VR(7DaF?zq{uTokY>cw%OMP5HwET#xIVPvw?DZB8n_RXvC2)w~X)~Oh zujDZ%_idWh!15^^l914GmFlCi=&d8)8%A~0rcj@Eo9_(LP3wWFO;wX;7;|fdtBuhd z`9=3)r$D~ea4h@Fq3=-yPKvd&qm+@>zVg7B+R1!) 
z-+G|)M6Vu2d(~^npC<{j2BS+))D#9VRcg()H;9XiGl10T>IS;r40!seBQD54VFV?N z^a}5`tb|GfI=b_nDU(~_n%F8h$WuN}HkZ2mpuH4%Oq&F2>`=<6m<&uDbv>o1t6}`a zN4l;gak_?i?e!z_LaDu1@c4$Lu&KIK>gAIo_##B-OJq3o*Xf^v7i+eh6J{$NFW>ad z&rdED8nIP;c-5z~MzS+AWlFm+supSG@7B;es(t~#w4t?9J;@@_be*u3#R@&oSak!6 z{v6Z3dj9~*2E4v@KoOY6CIi>=6W;m z5nymZKG*%jQEqS61RG+=WcU2U(K*~G=Wi=R#=owJb1xE0$fdIT1B=QY4*B$$Z~ zu2;0>mk48JW!A_AM`e?nH*bJa2y4_J@N}ADYBfAd!Nuts|6321=DP2*e6?qH6B>ba zu48-h@9npn@FdpxXt^a_9$X?6!cn;Dor`Lkx5z7VnC)9mG}N=>rN_Sy-Iq2H{Q_^_ zHw0506R#hUX zg1CRW-d$}mPJv-3E#C3GKyh63Z%c_Jmn{YIl#5%(v^c}{gp&a0&!Lp|crDm2a8I%0Z3$MZ zW?OC0o~+Rr?vm6-DGH9!9N_MbabB0KhNs-(LMN7``{OYDz&D&mP7v7*wLj zy@{jl9fJFbB;TV+PJUveE zXB)1C{f=>J;v4v~^rBC{73q2HjRD=>CU$_c_FiL-=T8yujgj=J@`Y<9yWOZ}`uT+zDP+OZEKpxIuJr+C=A3W={IH@^l6EPUwWC z$fI;usduNE@Wf}FNAYG2(TF}@$lkd%0mW#L0%l5nTnWE#`&d@55ALZHYNtc3WHAyvpemle!cv%e%W1ttId4R z50NKPUhX2iuDMpv;^{q#0FwGuEIjk@Yl&wN=9p zE6+a<*sQf9QS!A5d1H_n)vaOh41rbOc!xQgz_Ij+>j(KOa@68Pd|r}`D3c*PA*^fy zTtu)QUNg)7)bzDhaF=f5pB4Y7S@a5!v8t-`|6%McqvDL3F44x_A-Fri-DwhB8+Qnv z1lQm$!QHJP1PJbK!JXjl4#7P@xKG}1X5E=J-`sWovBC;8=hUg%Rkdpu5^5j@ZaC3z zG#tgOmqCKlzPEy8IvyM?NUEdizD(k5Bz;%AONSb!&3TgF;XyAM61szY zn=@bC^&KCD@#8Wjy-?rWZRip$lsFgmNjKFK7Rv;ptchNHdl?49(}y$?bne%C0VDDQ)CDE0X{T zlj?tgBk*6~C`JE;$Zcv?tpxV)$SM1w^|i%yCT4>5&&N2fJG@LY$Bkb7uoL&X78ESi z1Tnjxg5f{iVV)4|B3=D>`#32MWDLoC#@d$M0?;UHr{47Q_uiTF3v2~*??~7=*E_BC zy98n`5=8XN;X+Hx%5J?BH!t3WhA&<6a5BP~VtMZTS!E-2I&cYTJ@#U1diR3z`8z>V zspjc@=&8*IgK!O``QXZxMWsLtTEY4zw~r-F9UGJyZR^Vib#(6T^-4MX#PcqTNk(CD zJoZlGI<5`6y>Q&TtP4HNj%1^{sRb`Xp4tatjo)67cZA>P%+K);JBV-BpkP z6nCKd<0gd<6}}X8BGLRYlRA+`*p%foKBmT5*s3RReL=x3rDrx_k{aL^;nd+)~4xzUB)o(vx$&xiI2 zoW(+bes-jL_wwl`6374`axrk$@2x<0Ng4c^PH0eK;p>pW@*x1v!iYDfS*E;<6Kad;sH z`^85vO^bz@<5%O>-&P5R9qF`BUtS0V3bX-w3Wk!dc4;_yuS3FY0UdI@X*4dpoT7du z=!oVLIheQWEf+ds_~(#Y;VfTl_>wv*Q+(znU_VQfCQKwe_4fBd;>|&MX8JZyal(8b z<_tsu(X*YR5S(bAeVMP}56u*WWk!%{v!NC!3f`^ANpSjgM`xveBvAZ48hy>r-;;Yb z^V0mN0LNZtvKSlQ{$prHw1Z-pDiWvTePSGnq~fpa>mj8H80pjXznG2+0Wyn>fG zIWDP%OiB#q`Colt85s-x!XF@Ug$k!cdQIv-1+kiR5u=?`h<-i&<+azs8hpJyE(D@J z94jcF`R0K@Lw2#%Fjo>JTpmoKi_x=j`f@f230f}et4kqml6!acE<`sw|3wldOm5Pa z4*nKI8ovToF6(aSsLd!J#H&jo-^r++4%+w?6@Y!I93n^Tq-l{ zc)H3>?$i?ZiTLBBm?>2Gct=d~Bs`Hz<=Yy_uYD5t55<~22hVrMggnfH@Sc;kiyo?1xI1I1R*S4ukL54L+mS=s%;ABV|8CdtI$nG%QI^}e@w z6D>n0$@QF*$fYO8#=J535aw=ujB83DiQlXSDLu``@-6;Qpi~PI5Fnn(CK2$ZScfQA zbQfXnE+V#QEYSVa{1KHKPl6y{WcM1~#k%)i5^#h~7Yy#<4`C&F;Gv>i$|pfVN8GUZ z=LzjyeBU}|wKo!K6BTZ=cOJwO;Y&8L!#r}>(L)8u*Tp3v6sB|Iv-u@HE#IJF8 z^B(E=-x^4`nr@akvDS3%)0`7R$@c7|_j;=`*ny89ew4N&QU_&9S^!f5#e0ii0E$HS zJ+dQ)Hz9vA9;50|ah`9fdu!H$!?E~&?2gF6r$2hHt=DnuOF1iTfpB%SfFzT}fzd)h z?w#4xrLOd!2GXV`;+Z&fs`_u$9no|!z+Pe0I}!hqi$Q>|!&`n|K!;8}smF2V)eThi zvxSchlIg&1EBa#_2i+8shC!PbJ#NiM3ui~e!me9!>x3&JbDe1}qB;z!C_2vavo~^5 zs2>Wku6_F0h7OziCAz~L_8?6i=#~aDSk0Aa*%7U=haAv)}_CIA-(2%F;n$^e8gmYc?0-2OaDS@GCb)w&S zIR7CMA^?&Sz&M2^u)k>eqPM4~evyEHKwVp#;oowj=c`c;1r=boo;E#`J9b#x&SsDF zATTeuH%RH991;Tmc=#=jN`$oH)9mc*;kjl+tII(wNyPhi?`B0(S=8u6r${T>+YKEI z)zs8*BQNZE85U@zNUpo%z3G&kBsS!m&GhLYTx@67!f~xH%CNJEDR9UTUM97>#dwZq zFcH^6WktasfED@okZfYxVSTV7$bo5uDHMTt_+Wf;*G(|}_}356DwK|I;d@M1a( z&Sco(w{#tx+3Rj0W2HsmMBLrg#oEWv~#Ko_TN3u?lu;qM|2s5c?d&l=bxPurgNg7&FS5A|v?aAz zFj$Q3*v1vKk5mJ;l_#meYMq&1!1KW_jGD}@Ko177pp`0~tZYPa@8Zk>5f8e5 zM+*)F54JijMG&487(Hh%S3(@Qy}v zQxSQ**?51<0NO{P67OB_w-q>`$8|#ZNliHA7T|`bVz`aoM4V*MdgIJr&_(B;dXx}E zI4T=ho(1vJb6=*nl?cq^tb}eCERybod9_nd(qXxoOLS~JLgP=1YC7Ni>&;QyZpeqk zd?Vv;0;CyV-(N3a!X|%eDk^V}rA7);3h_ze-HcyZY>bFPIe^=y7&)njNlhDc50D!^ zQOh`aBNu#)9m~mOSwlw+Z@h~yX9`Dp$5xtOL(Zv8&+GN44)^*-_^?c{i2S7)_tp@E 
zyRBBjpD-jxz#k@L@ubcgNV!sfXMJa~o8|WN1hN)&I!_I*iH(Yq3;3fpocK*1W`yH| z=(0L|&(nBPe+HTsL0{fW3O&zU<*lIAYxRntFVd}NSu-q!gr~m$jJ1*)M`VVJS$$RO zxQJ51&-B4Y!?^IRzI}H+zGUcUEjHVuNZuRiG#0SuGRAo^oz?!HaoP-WuF^bbY6(hJIRjN?xLW}i&n&?o?bTSSR895jhQ8V0u6A!JO$UFhk z5V!b1rz*$(^djn!+s7LmIffZ#WD}%t;7%Z*zy;cfynh zy?7Cln;J`v0kt=zjB3r#_8UK!*?s}l;(UxnZhiP%9QYf_bD%T7et)PWl?5*_fwbCs zRFC8@s^M9%^;tCA9cuGtEamZ6mW$Q0>_?ZHEXjatP$T(_y%Li17YP~)57W*uA)di> zU_|P233Q5$K3P5?i@ec)OYv52fuH?Tb3IHQx$~Kdn|djy#wj(fRNfK_mjbQ}9>q&p zj$NW%U0E5D%Y#TYo=yqPe6-$73Yd$FXbRUci1gC5ZOdQSbv4Y4C?`vvsnE!S1wUbJ zDNd;{DmJ!_#H*sb8Q1p*#$->cWg#%QR58H(7FDHLv%y^Voty5-Mzzreu`-?=cIhk? z$28)&Q_N@gEu8#gC>gx0_%-VQ{T;Gu4jy5wO?`#>aQmeS-tQY_4|hW7$bFLNY3N`J z8gVe%3Va?Re;~G!tl`8`f%&)kH6>)O9t?Yub2mTG6mqPoEn_>@I$}or4oMTA&BBnk z|9!3uf+E1}wysAJTo2B5j<%I{hJ+0k7LOPc6Ji-0<0K}ag5uGbttzw*Bh=BTs~jK zWZTrYmqF8eWYzMuzXG!kf^2`@XjYPJWKe|p^XXp<44MmhQb|TMN#@GJ5l2wa1zkv+ zA)_#{u84__St7RwtzKXC5~2rwQP7H)8_S2C-QKg&GKyf4;;|Z}^-YZU(?fO@Hs+l8 zq0dVoQl^C04^gS15sKj_dhxD&v8D1qVq z>|B<<>9v0p*ISZE{OoWw3u%g!W*jY@s8H2@gnw2qM=BcpvW~3hXU~i1OnWy*&7~2H zbsTN+KcV`mEc-3 z+rvr4e(6niD?Q5BNTmZw7PFH6!dY~T2thUmZH7>38sCyg7)Imw0);O*XVitMW zwg-WwW^#adA}o#;z$Z{pne6NR4YA zV@0Vih>Xu4{u=pGU;MJ_p)DGk7m$c=@_RLjPo#!qT+4~`L;wj7WrWzZ((jfZ(|BBc zZW)Al(>S+ZcRaY`EWh>^g^4JTmH%Ktg0aVJhugm3*7rSchuZ#AgummnK0}vRicBlq z^|(>KLx`<7TV9OyJTECNPYHTHDW%Z}LfXz4o!JeoumRWez04Vv2!7`?S10^4|HGUL z-QNYdRM!Q}Vno9=HN%;gF)eZ?XdQj~Wt~<|yF$vFnYEc(h>?9b;{AbSrsYEl&?%gH z`ZPR(4-C}=78Ws@ukwF3G$()kyT?;#ioHylm0ay~n5N-nvRv=>eGGv)8r9OCp0hT` z+0a~fGT2)c+R;;nU8;Kx_R~+U0jIH6dzVoC>|NG>XkMI8iQ!u64AJA-Yh}MwNr31D_Q%>xK zYePe1n5O0yF$HT|tamD+1UQ%Rz7m{U8Xh=yN|6RE6WkDedEvSv-{F;C!OiJw-Q*R( zQHjvB`tp_?&HyF{J*Kp^hLL0?nFQHzE^5(ENtL^?zU>&aiW;tQMh>9toSGl|;Kc5C zf|SY;sNue|QXd^{UExj~0l8nI0UflSzwtxrQ4kLd4<-tX{;1HSuj@d&l+;dlS$&G> zh)hKUewfvDq#$Xw{n4qk_PsMo8c#h=n~iyvX~f{qh-Q^a6Se`D6njBcr0#VwBb3nR zIjxtkw4}NH)fMqzIW6+aKvGF;^N$Y+G~g+s zCb?qlf_mutim+pw$u@TKKNBLXa1MI!Jj;`Da&qEn#`JJKQ5$sax~zAM0hq_dX}=V036GzP_F}0w5kvcYo8K{{;*ZL7bLp6m=Zp zqxZ_T0N};2=Y?`+>yDqI((i`I%KU7`N1TRFiIZcQ>b(f3bR_M` z5^8#-8z}6j`{YW6QcA$WOJ}BpoD1JFc%<9jZD10^lNqTJsFW#6$e|1yac`{p#iKGx zV3N24DLwkoPYfZyT5y=EW(U)(^nkj2Lm=;OU7|vBoCOJsK3Ct${ziKor|q?+*Keh4 zEN=BC7quj;)Ly0Zmb^z!&n>@+4Y}h_pX}A#7)q@|rIiqW+6Z416lZ*6GMkvQr3qdC zz2md~OAoUH5t9}trc|pTg`m^|ANe;e!+S0C5M27#kMxHTg6Rj&E?2hVLf*>L69N?P zdvF=B<$`l@n(U2bc?fh#qaPM<4cz~@KPD^ zMK~S}X9%&mOm`~5;%@UCCrOSnMOnhsWoP`I&-~i`BSt>&({_?Jyhi^QpS|tM{K_{L z`H1wmYV<75Bd+w1pP1hE_qkN=2U%l|<;ia(F~pi?U0yT7l*+L0Wy_Rw!XeiI7stQ5 zq0OjrAJhzFU1O_GqsDQfaC%={xZ*S;w_9K}*L7 zVR-b-DG&9>vftq(BQmNdA`69s=M$ZN3*EAb<-n-Qlnm)qB~ukft@;>iv>wD2PEE-r z`0iI$`PpXP-y0Vr2+X%25CI7ex^#{1(({Oy=gV3e z9r}(k6|xy?PH_m+g6_JPEFMgU%0x~JjfwByI$ZDl;)xXLPiS@TKolmqwv)df?Dh*kZ%XI9AiVk!}M9%F9U%_GcZhwx0t>gC2i`L@lrcnAj zO#DPy^8TJIBzEFomo6+Qu8w8kl9C7W_iFD^&>*a!y`~g&WOsQ;WBwLrZ|f2&SMO5@ zSgv{RzL}Dn${rCB5w`{=if#CR)m|2aXb3ydr<(n)o%QO(HMH4HXmD^$msQrdig2`7 z)6}rmE<7rJR3(kJgkFskde2th?|U1V_9-SJuTnm4c0;;yycf)MhItY|-W;&xio@fm zOD)&d6)&~ufTYVcgxaC;nB48Br#FTF^z;b z6w9lbS)_%n1xn;!B4+-T#Z9X)Lyo2Afo`SUAVWl!h(?OdEX{#ayJhacA1kttFCRE6 zlN7D1MjFccOMMbD6M*@M#X<#Hg$GwP(mCM+k2t58zTC!nOeyAO);4#Fcw(wByVY8C zt3W{k3i79P*Ga@36!*PsHn==-$z}b-Por9%Z1>*|@cu9WWR!YJV*I>~Qi6 zI}sSk8O}YA?f%rIu8$qYM$ac*n`$|1Q`O0^V!U4TAbPg=?t5nDGRfm^lIregj!Dl} zy+R(NO1K$DN?OyYhsyT;`0RmsIZww_^LNT`&E;67%P-Hj)bw`o*#cNw#1YE7yNodZ#AG&I&FAVv-9_p*b>A_q+ zhR)8l?GM{gwSF&NdJUgb!jezokmugXsr)%A&fEQ!|DGf$*7T^+VS}BHnkxoiG2Q@L z{;y9PHeZEt^F~Ihv4dFIBGl=))Yu}oE=o50P##XIx+<|=e{lBqqxBy5_xGcqXj@NY zCvbE=ynDZ<_7ZWdIhw`+(1d2J{CIeHv`|>MxoiDXDUYw?;h93XX<=Ag#%21{jBEgU 
z)7iM>oC`v1+RnhO*o{`OhDc9BzAX$4x_o%NE>-s?6;HFcqy2fn$x_ubSNk>MF&7e3=pC)pW>ZHPc9>lu%|hcB z@x(8$mwDhI3nm&>m!q`YB-LVZUWvsY;Wr`^E$ndurzi}}&nnXlTr{7QB3voVi%5$8 z7wE->Le`hB z$Mdj-6W~kQr``udqN`~?&u`c!3v6t#0AiNGs|o?{!pDU&-uge@$VQX<)q;#&h>4kN zd1(op6xWmkSVBpLGO`t&AI$=HjanHvY{LwX2di3gL386zd}rnozTR?wcak`8WSj$S zykpdTWj^_4W>pRR>{S&7@uLf+y!cYi&U+3F&jM8&_!5g!@ABhwa$97$wQU0`P>;vats@c2>O#m7mM9Zg9m!Qi zNGb28A0+vP0>F#Q7tXX4@HzK>%dE?r*^3Z=S*_|LT|pkdN&eEyG!8zhcdwmyKT8ioQ1{az*+q@@7|JKG0V zvqUQSpIQKV_dVR6Pv7hk)5#ct$(v7I?nwn$C&7`0DSLCy`+d^G;h&YA)QI2Z7LOQE z+V(C|79KBhhY&3YuJ@|EC&<`AnjALyI~Yg8aBPXakGr{wgC}CN!1F>&A(}q2 z_-tRDucCC;QdVlVj(A|wm||H*IGhX+tcLGg8ypF z5V0kRiVaYJS#vCp4yE0=x{cbiEAPNNpjAgYKlm9H<#;*R-av55%h7~fIt$3+CL`sCATM41eIqZIX@?A{ zpKC@XolsQz*j-d))q&-A-*4xX`L&t4-{TZBDU*f6e)&6N z^tkVS^ouNxQ>hYlH9S*OJ^V)644^p_mi_JirARp|`8hcyCDf592@zTl0!xH^b#=u% zy9RJxRPWY%MM_$kj>}|CC^2x00|6)lh_sxDcuLx^dt}J4`@WqOiA<+PmdT~4 zb8XUU4PUfQFWOB`FU~gZ+w*po8*GZi>0p?86tuLo>_PZ2jKVTFu?0^_qZJWiSD!SV zw!$g5fxQs?m>|~R!36<_^<;97C?^-1WVgpFOEOW9U%$(nRYK~3zoR?Rr&u+QzfvoE zz*&upi_89GH3`Q7q$v(lB25m`bPb0~UKvBV9zw|vgqH6LerZ-JBs!En!@R)jAbwE( zwKU+9qqAz57=01j3&Ltor;nFn%yB73tue2I12sH+Dqp^^5o$&SrLcgKS6Ok+P6!c8 zf6l^CvMw>8al0a`cB1@&2u#y(-#GIK<;7#?7LK+vk(&Qf&CJjpQbMs!F>H$#6OpJE z0$YxaB}_-WMF>cAHz$McZZ7ZGARhO!IX1A_j1AGhwlx0WSdq>ddvA4Gq*3KY$X|#T zAZ!IHd2fOnz_LAJfC-`Pg8?@3T^^tNaiGwm1#mmuT}sd}?+eH+Ef$XXyJGJ|SGa#z zKkRqjT}f3bAUrT#W?Ip}a^05A(BJP&{*HA2dllCMlb~huGp7Jt(1 zcDvtbdNl{VGupkuR>n{+si@?e*h$&@K5gztGF@kdXe8ovbalbw&)9^JX=7)X(uhlM z7ecSId3+o>sxB7cCwE4&Zf@b#Z-|IB7<%6iJ1uusNEm)`FSu;y!Wv#jmt@$#cFJ%J4jlHh)V&)^oYiw z+`Qo4Ff(7o|9mqe{ru4DxpSOE=Y^e~mCXf%V+Kz}i7ssCu@Thrh{)w^Iu}Xq|L5IV zuUPkt4I!=K>bXYWaJHp*@X>R?sdQ_*90Jq|f21S37b%uhw!tP%VN5;%783kw$OLKp zRRx;e_($W$!h^9Q5rV?>@com6aH{S-|t6_uc%inBLt-^Kv3czH?odJv;aKpzbn!3*V`;*fPCeUp5Bd}HvMsIg}o}>^z6I&w2 zO=~&}`jbwDW|u~g+9|z-h~E*FBjj;w;@s8eRxNk;L%&prr{f!LtR)A7;FCwS{h67` zA?`I8ru=nfly-Ye$Qn!uk}-(MKD->sTdTUs$VTxSoz7=EQ*3s=WxRf)w=&soW)`G- z?;7F`7DPFA5rp`Pm1ZkUgxOsCf_R>#K?(|+#E!8bO*t$OQ>4Yb9_HY5V`jffYfYuT zD=LrJ*rE-O%E(+bd8Wnp$LdFg$-gIysS}O;mUmo_@@xJm0Sfq_V^2a5y}e1a@&=;N zTuKHYttAgzC#mAPOuvLmu0EOI1xBWZa4CABgxZkESPbx9;?%p zy&8r|zo>GfSsla#_z3RYlku*f5}ebkCay4$6#<+8R6tBw9hO5DRN|?J3U&iQtT8}t z(x8%154I8*rm{6xScnLg={04eIRi||t?CCe5||PT`I1ZAvN)us!9&?YOt3skyOAt> zz?CXWyXEBWw-287zu+KRu;8484K{DCtD^2vSn#+b9JDzPE6mq-0V`q{qyei6Dyd-q zcS7SM8L_Yk&|g#$>v@yF?^lNn0L_MN$f2}5ZG#AiS3$EAY~F>0Eg*=5QBqzEEX1gb z#rHdE3G83(Zdwq%P)LILz;?!M*8p18tBUi-t$hYi)!>t#^4WRC+0QFaJN7vKmY+*Z zD&fE52f6R3xYWKz#PI9Gr%;ejUq*&K##$Zmndle=mqun?F(pmJ&T#U%$ox9q{;W5K zQ*Bf)XWuICyr9Na^EEamjVz}*TwXTlgALC!hWs*xfI_6>ZasFtSo*5v9V4dOGs(j6OO0P!Koe+z#La+6uj z^K1Tv0`6c{`KrOqKxKH=a3x4?dm0MDFKhk<6G?S}>D1LF3pz%Cfm(exx8;O`)MAMk zdki$E*A*55=l(?OyGM%fj0#RkPA0NxKRo;hhbt`?XMBhLg~VfOFa*^ob6I$#eT{mI zM(hJJ{Sx#_5YI;SgP9?8>Z)6#1Y3cmtW}>IN zQ=V=Ob@jwmUL^$bQe*?87U%wE(J7Qj2>yaO(El1B7->cps<@z4Ibj1yD;o8PF0kkF zSau>y^kb1TXs0Bh2bfk6sxm)uy1uq@oGA};Bf$qGKGr7dMlF8UNlD9pIG!vz&~!eA zuc;)X-9KGmTqxiUxjgH>ZMOV&7TsaGz1!ley%>H)^G#Kc|s|GFoQO zmXnK2C{*7xu-(+D`~^WhxMc;etuD1{yxFAc zJE7P)T9xId1uAuLBKqZEAgK)+(c3-4gaBJL9SO}KQj9V-ii3tHpF)w3M23rcIV&23EP*70$S$f;mlew^dJoF-+ z+5S$I!vXG-w|6W1SOe$U((JE*y-KS+|;{_4SX2gsMaj1~|bHl3v%dtoX^*B`yM}h3xe%792B~60x~%qhZ70ZVO7bXK^1tE*SUxRr6mK@u#joy)IGWvSTNh@ zPn)ik7kviB&0$QPyS8~J2HjYGZT7M&84U5|cC-WkR$CBL5UXo|0ttCGCd3wbb%PX% z$w^3<4K#~2I6)#?p0?pJF9@EZ3%M_31F?84quqZx{elAK+gXi+1T=0ew7b9-bMWPIX+2l_rP5 z{wP98B&O6s+Qq(epPMOJCJqh(%hLyP{TiodptrD}u|dvd7V^7(z^p%#V86{50}=;Q z5{odMUtIB>7^(m4esCuohw%arCZ#1Rbri*|6(Nx;s(Z_6x$4SHo5G0x;?EgKnizR# 
z^|PI%o3xOh_;qly+)A)+pcnVS*kIwOjC8@!%m@Q@sPPv^(%qlGC&4ff_M9`hA4B{w zX#NjpAfS)e*KEOMD#YlIV)fNnY3ve5E?5Axjp1Mie73j62DfXk=9zsTrz3h`R$7jG#CUwWF`I;OR6NOj4;X~&DioTcqgw6$?gWX!5o}NR)!(3;otR_vsw1&oj zeQy}X(Bz~*H=a@9y5|A8VTadTvzXXrAD+kKUYg`##JTU&Y16~sr*lbJX20-ji=)m? z%nlf6I--L^nlU`aw$<(3hWYs*C8}Lk^bftd%k1Qbk&$89k^sUtPH!pwvf0Bjp5tZx z;UG%9d;;^|c9#jr#4hN?pDuiBw#fs7iWjJ?f0|hwL|Wx1s1KA^;gCbmqZr^Y!4WdE z5f>UR?`g$1I*I-L4k$TAKk*KzPIQkDzP>wNsp9Lvefpg|H_82o8^8@s?+Zp*Z~L-x z`8e;7w{rR#Z?UF)G)@v4^Lp|c+W*!BoG!Q=8WxrjlC$9#Y%5?z2nB4rW%%~`HxWGo zR5YR#FdxhRVO844Oz_eRBaXEY#m$zoozZ{x^`qkdW?$bxkjX9@OvpCpDJ&GW!h8-~ zdUWjQUZ#A-rcM*Q^BbLnIrc$KjY|^k4`*7hsHU%SL-R@+wEmDqp~{F~s5Kiw5sv$n zH~zFw+yU_BebnEVx?RSEq@@St4n!_-Q|`>CJK{7Z7CHmPGJ6`$$e;?Wz#?Z<@U2|e z-@lwy9dj@v@>*yYYM|0K{&_bbWSq6?dbuWdqjCO=39~SzWeqH#x$phGRpl=o*8lZy z$6I#=RlXJ7<{01_d}A#w0+l@Rn5xb{sCbMjeL$($UST&!%zM6B*r?No_|vCP`yGfi znN}#G;xzrA|COKz#xi)uZCXX`)>`MfHCX$bKKz6&2MFGBFIE~94zGV-1U)XO07*it z?ELAO8Rd-jPQ-Gcf*{uT-1XYjO-O0_Fl3=LJT_3r(P7#$N%Sr&HJDpeg|OF|UGxJ; zQzMv8y%Kn`2;wgnV(8?l7T^@2M*ts=CPjTjfTk;bcvmS8V!A8%bNJ%lQ(dS=#8_zt zWQ{DT$cyg{+!3Iypc0OzR}_=;^<@Cc+?l+p;)^!u!!U*C(5^jYW@b8W@1oeQw-d|8 zQAmH1XWqs$=QLGd(XKFvyd~A45vQg|n9fSDXqO%o8MB2b*Cv9FL3?DHz|v|rH6$i$ zBftaA_up#}1Y8xhz(R6o)j%VaN((|kKumUXjbwqM!pFj*f<5EuE`7~zOV&DhFM zF6h{u+^i@k$0@LWgqo6=7$l#>KrJA!5TFLEJyi|KK4g+^6%r94RZL^&W9P_TSnu$@ zyj?Ka8D`AuE0B)j^t$;-$G~tWWbBIo756!ytnpvaMhnQyjnO*s4?U`L9_M!z2H$$J ziCAq$MST_`q+kM-hJt2qZ4d+g>`uahF;sh|wN5y_|I&Tf2;dEd;JD9w_%qbcYR$9H=FzV3|3IH zmsPfGS&Wz6O6htQd&B0h^7}V!p>pV*18!Zh@ISrvp^X>sd#9sS=HCg;e;-~1)D@q} z3i|w9W=qsOj=rZ|?2|gsFf$JeZr@&4L0#N|167zhMfld2*Oc0Q!VpcY1)ME`O5Jjd zFc5+Mg(8Z-q#B@+tUKUl)Ma6q+PIAlkO8bVH^p2Uz=AD^3b=i-H;6Y0My){AhKVxO zJh%A)+iEKqTagp+@`d^QcH54#vk-@D=<#Qk1a5I<;caCMV7 zGcXFm(V-|{Yjr8HaXhz)r%UgAkmiSeB_JqVLICJUe9|fd!}JA+gU$=RujW9&7zFG@Z1J;oF@WPQf%x&9F_M5blj!Ss1|64+cGw?XxY@Ni0E37$3>e013|sX(ik2 zz`P)w0OzA=U7g^OECDACg{(EEPL_$gYfhdo`QM{Iue-fc` z83!B32kOu2SE^lYbwgugV?*>Qm`T#gbN|X5I>_xQut$?Qp;A**v-uQ~6^22&UnFSC zG7)~g1OCdQ?`6ux{4)@nfYQmy3DIxcb-{ZduH(6C5;WWPiVUoEl8GI#U=D2h{30Wd zk32@J6IywDl14WIrz0e$7-wvM#^JAQb#4GUmWF}8K?U)tE-S+?6aW3I0H`4DC?9uD zXjqwM7QoLmVM5-3_}JOV-x_~HenLI+OMil ze({Iz6~9_tu#(6{2n!h;A)@`>h_AqW8wZb&<7_7(u}K?r9I)HlwR#!2+jd56C`?^x z&@`bLHk6YV6@7y)tL)8FQ(lhVCcY8s>wUik49HI7w5coT$b2{&@oE1H)Sy&Vd()HE zV1+P$`aYti0+y(n@Gk=dp78*zU)ZMWmZwjvuY|tAa(p~d_+yln`EcYT5$?!_nt$4% zqwzOz`sX8@$kcu97Jt2%_ug@)Cq--OE&2E{ZsS9l38Aa)hULDdY}yh{t=E1gCi5?6 zYi%WoJ7+5nT`?6ajRghm@6ICDr9UqdvFfG!u|O1}G)4TaqU!2umFft~D+-D;w5gwy zE`DT!ScG!@PPs&m#@R@e?9E@#Z!Hh8K#}wpGSNx{yPtTfulg6h!>=3aeSw8gzu}xJ zBdB-fG~wG-E@|IA%VsQ_N3p50dQvn=)O$h=AQ85OPff@5K;tyGkA;Dr-7dk5d?#&4 zbRV+Cq$D$R&@&f%GI6~m1@W0BXM`6Ec3O zB~=wY89Rd)FI&EXH+w3{X>W*g%y+_RociQMUq`Y$ZohP;y?`~V{7D&{n9GJ=`q$b{ z9aFl?_~|cbE=LSrMc%Oc%YaHa>#$5$pR$K|Yjuk;XNSGgm9PKi`6TB7m)NHC*M9+i zk<1^tG)M3cUgK!L{YPD~^Qt}`jL%<<0PrWRC765QYWfTD5 zydE3?m+r89{k0O5ob(QG3R*|PvVeoAj8#!X&$v@#KAez^tT6d zthf)S^{<2Wz{Df@u_=z-=pnJX5)6ag+%1w;nxF=Txs!=GmC;WPmA3r;gw z5G-vFYr5qcFpcX0{QNof=j(C+=&lj`mygJ>(C>^0qMD)RocYr?30~E zV<#-sYzLRqe~HCnZ;W7w%3#&2r*HwX!9Sv)D~A0*KO#!=%}lxtbFqk{?u`+X5FVwt zOTeFp$$#)7`XN7pC`lM5D$fhEMKL#7>jM_ADvVG45h8CghFMP-r@Dn}_8|c?Tci1V zZ$W>5SP0NG03nHQ1eRu57VWNOF8D{S1~GmYux5Ia8YXNZBKVh zk>qotN^;V;%->m<($rsrls~1{kP^*d%5)(74+cr(6Z$bJ*w`WbQJ^lONlZLJTwlXHk;q z2bnR&6R!``vLo*=;4j+7fa=L-DUELkV4QRf(6KPPiC`HhInMxi|GBV0E?vhBz7UM ziqfvn^do!UW>i#xnAm;w??h+nYz>z!oL*4*Tq}F^#jfbSWzjL>hR5|)7fun0@y{hQ zS_}(1Rh+$0wL+Y&{Cp6=rv#wSPhBsn)jDSHwvl1__i4@d$&!4IUcz)Y3ROZ*kn`y= z72jPo^M2DSb|IFS55C*e0UhtK308ZH*ryr}=BzYTZw^~0>f)$?FJD 
zU=XG?MQGF&vQ9WAF@2}aB5ohgI~|bw0lXVKICJ5DAQ8YG|M1vwD6AJnsRn%@e_@hzowAYP}qRucvRLTG83qWXEx zpW<2yevb{{da%V{sKZvtEGuU3TJB(gz5|=s0ylpGxXq%qwNn3suP^eCch!y}lfEqGT>oikqZC;@JMX`Z5LXFaj zwe^7{VEKMHTs`c>ZneW@CBPB>L?$kWij+F=-2CDPo@DD5Y(xysLxkg;?$MU)V&op< z_;2rDozEy>9DI?BNB3m8#LAjZZQEt*NR^#_3aLJ0aidb#{y4-pNzL#wdJUT`o&#Zmh`;sXP{;!}5 zxcm*9v&OzoILIs3-S1VJjvvgnjBi`WGx?I!qcXzlSDEk%uh%VNX+zDh*ZAR%7%8?Bbh+R2BOpJt!j z_okUyY2GQEDEB2FE$Ker=xP7ci5r8G1I&;R0$SaV$V%;#Y54t0Knho+34xpH zCM)>$YPo)%huJ0y%)UAK^j3(jH-lt?czghi0XxA$iGsJ==3c3vi zLC*7wT49|~BFresZ5n4RjJ$rN*|nPR_~F-HQUEY_J{ zQ+DTCoz}vjMD#`*YiAuBgoSQc)mpiZ2=1__zV_9ni~1IIf5buj&1HJ{ljivtbQ>WE zK3xwi7(Gxk@|@}yP$>vUbECO^Z$H%o_@f4_`k!T53#YHQ3e$iAK#>h z9>BWx7q!(LePmd;^Hq}dGzkRxE*#P<*MDWW1t!pcl!9Y+wHTBp;@kC8r|5{>oI*!| zsE2J%EGqoUb3n>|c9frQiV8km4&(UZ^P!X`L^jMESgWDi;l<~Cut!5H)=O`mmp2Mf zEKjbD7Ms3MVp57nmzF9-NW-UAjt!ptSBoQBzJ>*DmC%(|7ZG^r(m?6yKb0DaSHRz> z{|lid?9$IK3NTCu$Usn)Cq#h$9&zmYe<*wFxGI#be|Xa&(kb03DV?H#G}0i5bV)bT zAks*ubSjNBQWDbA-Q6i6_0Hy;d+)jVyuZH=pJTzEnKf&DYe1d(Pgf|;8ES#J7?rgd zRbEKSz>n4PywsG;4EH3)GMPGIObS|=jD*Y=pV%dI!q+MEH`xOoNdOx50DT9ox3|2ZJb#sSmk#-I44sXeG4{;$X>dBO$-#eviNc-T4T2Q11ut1s6 zBG{RbuRTj68Qfp){q&*i5@W`g{K>`X1QMooOSK;BuxO@IcQwG{1e1}@lDR#GJ@N;2BNR0T28_ClIbhM_N*v(97TDgz zRTINQCV{q4WDf{79d^`H_-c8(KLDyp+1@%pdv%zX51{$Ke`J@`WAlcDD%tN*g_P*D zq+aL#cvT^OY?&o{cu0dD{=%MWm}go$MZQ5lBECgO(-q(g1Ov<9GeEcMbcumhovJ^p z79dc*`~hE)g9`MQj_z+@5x8~kHs|X`n_!j5o30kh?T3<(lo5{2 z#bTd3`|8?{cV#L&h+y&vg_@m(v7?bLr_CtbmrbB1C;jygrmHrX5D0E>aes0%{8o|{ z?0HVgu;uywFZIWkR}cBu^|JhD+(ib3~e!7iJZ#wLCUPAMp^}&y07zK|j z?hrOTF%`paTE1aP{NU9i+V#iou%Q2u5k%%i$(9O6r=+4wv#zSjD$GQIoH5hcC| zHpX;#5Kkq@w(#}?!c*ZMolm0Q4^G4g@kp7iTo|yWXi`l5MDpg2ErpTa14fSV>19=H z%NGXb`1V=hBF&Qu-;B04gqb&n@m71Sd6V!l{6RF&a(3rzeTnIV47D1!7%jZ~n{#kx z!lRU77*fh$xP4z5r~sHazYLu0e_&6E76AIu#kmV(gsjQCq``sz3a8A*5Sk?}uRr+)XEN1WzzG@sMSqJD{(ecbK- zQV3l*wOJIVK=|$w2Hm3xu1alfxexdK#9hK29GK-Kv6YA8*IdlJ{Do)9WZ}>64l0GK zxLPGX6iJN+?{)qF9ehKwFONxw3aHGrVEu|hExn=t^RGPPpbl%zJsT<#xlbbG{n~ z)cSppdYA2Hv|A6g^qsZ&@5v?D!oycd1sQBBtt4Bd0uQ;<6wzA zTL=Z50oJNm55-o&7Y%d*9imX|n|w|%VVVsxY7MJo_%V(9!Qi)RW|)k1pygRd%XQHD z=B6Lb9vzgq_f@qNJYU0~?FA7anUGSSGKHuPGYDPIBHGDlGvop#W)5o>IfP+~X$mpE zAtuDolg)DjTY7zq7a}v8uKC+~t^T2A{WgD0EZ4U_ot^PsQyxmXi^=N!;MX2J&)BBp z?_dh{V8zrZn3x`UA8B~|cPN&|D@QFq1+c2GaggctDdh5 z0C;XAet9Jr4`wDDdhFkUUZ4;ZTiv7*hX7xI!IKTo&V~gBA~F`ccL~*sq=_-WD(dU& z2SrA9mFa)Sq~twzQUYktqS|6PLWU8q<(@gM^C!07j1l(z`}J>4MLtC0GkQ-bJ%cd8 zFAZBV(TONz#Jj#K*^4T@tee(3XezmSwliHfd_&`3GexKik7kL_UR3=Vdkl|PN$oWq z>z-C(GJo&pF!?3D%?{>|gE|O9y(ssn8nM(Np>M{fnWE2t*@}gcZzJW+S8KpG#(cba z0)8eT6mTTK8p1P#hwUIOL%TgcQV|_x>K50&g z!roXWef>M4kPxfwrBxO0cCmHSXzl2^Lp2nk1h`Rlm=5KUyyRhP1V!NmKrnjWo?br` zBaOv&<&Im+d%ntSb3^tZL6jb%8gNOPysGd0)ps7b)0uif1@f79`SMtB9Q}K00XWMz zfDuTcA;tOMx&ro(kXR@GUoh75jlqB_dBVf8`kt31Q7!_FO z*W5d_^%NIIwgCOZ?uzHC1H(WdWF(gAg@&O5VdZ;YTIXUkpH-YIG3&A2e;ZBPrwPj@ zKsWx|$MW(R!E3@}fxEw#Yok|4TMu#KxVA(DA)|Kx7)V>&`;1d4w7xl`ls;m+SH4Pf;?gxv6AKDc*7KUclplgLPo!MOG9LUHi-9i5Q*5$Z(^ectZwjgS@ZgG#l4K? 
z>ppk|uCJH(qzZS31us-G^4|zMI6#MA&mFCF*=_!$R8?2c%F9EHa!mY4e2PII(03W) z5g(xlyn{b_dfJ9_WuszaSI_vbs_)x2!ah_XXGm<-JG|QUwAz=l<)7Z|**~TmA59C4 z7(6NDG9A5ayQSUdeTJZff)bTeuS87AxeuF>+-@Fufa4QF1;HbI7@w_Q@?qrCZNjBL zRhZ?&#AQy7W_8}TPtHSUzYYa*4O1?Xe*R?d*8A}+-pW2Wb#mnLNKlwW=TJSHk3>x) z763zpYsW7Jlf(O_>Sl#bHkaNa^>+yZF*mxUbrAuCX&U07zf6Xo2f;<({bRuKYFLU| z(yXy2RWE%VKI>NZnO6*UJn9Z0)Clyi&=i0IU~a&!UcAPmG6EOGaxp|1bJ@H6NqJ_9Q5Z-Wj#op-pC zJcE!IBG5#&69}R&Pv>HvEd3J{yd9HL%Pb1K4DZ29!8^$zxkW~$Na}&LsWW2!QVf@5o^aBH?{xc+i!;@LX$0!!Mh((}dC}^eIjw zNr!sHyc~Fs1PlOskPMNVlXtLU;hT^QeEmE=vGW9YRXBW6AdGg&|BuzYmG}V z*RmN)!{{bJiqLBx#m<7hm$zEjtU%FNd?VHbNG{zu=zdF^xGgQ@Alkp&I|Zwg*!ml<_qH~!xKLMZ55RA*~64$-0K zYPLV$C!Mvh++n%d!ey)Et#Q?RR;V7z>$b~X5w;`8URFrQe;UXWyHzmogF zwXc!n5gF@guIh-Zotzvlh%5M#h`q;yci2HeaeU3!=hZpFqD73tTkyuIFyec%`MjVX z3@fbd#vf3SjjP*`@+rgM$crm()8suOHr^q53?mQGXKjs<8uqSB&jz4=Lp!qGA}NjA`v57RYFmqG(%0|D9hu7Bz=D zwqmbFX%`ja9)1x1>MeZ6nddPjkjfEOsR8xZLz`=xAKnI_4M%^RL{BX*FVAOp98g#1 ze7gBEHa?b0gaZ*LFUMf4K3B8Zj&>Pm8H?ztpv%snzPkV0=s5UA6fp0d{bYg|PqZf+ z+_xdW{!1GIuLn_r$_TFxnr#LqHYGVhpg;`fvQutd_3jHoj_&731+}$&D!11Avvr*h zKfm+qZW6wy$`QZ7MiN74JrRv-*(Q$E3YZd((>2(#woMB0$SfzAON$Bm7>}IJJu|l~ zjQj(N+TGvk`01=Dd`}z=Cgh>&_F+y9g~@1#2@#963y&xBNxjK+Fjewcqguue40&+= zLM}BrB4>yZmJiEunYRJ41cSWBW`Xl?B<_K!?NL5?-@ zcvh}*!HT)r33;2vH+eJ8yunrlx91B;2*r?9rqQcMDQKgWmOITv`9(%so;4SR8yw!< zs0bMX2CeW1fXmDpQ6LdKMA*(rFnIo)M!os?RiEzZb5j(dn4%Mnwul|3;y7t8C!gb+ z!Ly-+9}HT!^`Lr<0Yuu;K)xKROa5naDh38nmwz2-IrL8l1P>A|yDu(Wg|2ohJD0i< zc!X1hmiw>D%GlwaJ|q{JRc9J(di=+3s=VJ#bW+iVb@bL@33h93ZmB}QxmnXit;E|` z9J`t$K5AVVz=?Geo9LO%FTHPaDqnExW@Ag#B&Ym@yYX1tcdK$kUuT-8y#s@rUC~mO z>k-14eo})iqoMQpz~k=LuZs;gdW&6#AEKVn;jPfgOIWvZ%Jux<{B&CH|DtaupeI{P zfPi~+A}MvzcTUjg_S|iaZO5qCrmLqxILT@!wXLdJM9^?w9Dt?~M-?TED_tpTzo|q! zt(LMgs}bXPduU8uNO?m_sZ@>~r|Lq(Oy*$92F5V_2v{K@qyi~4E2=i#&@bJBBy;w#i| zuR!+sV`!BG2;<*r1;DGdDFUT2r(03;RYNlu^Y;OF@W2qy+McU!ClZo|NNDasp28;! znZw6=O|u>Y9%6UIj?htV{-?%%oN=A~hv17P! 
zX~PZ%XbhBUsUHmN@sy90L}_Oe9F7Fr$S=R?In`RWra?)wTUmqW>^rqTftJd~qB|cT zfklsMS7{)5V%__6WrK^J7RKcVjYc3kd2m>g13SNAV4Tl(-#&k7NohW(KHa$APYoKP!d1aw5OucZLP)zq1337GeJBuCp7$s9 zP&E1y`5&NQ`zVo(<+sNQk(85V`u=W4gMB;%g8K{mmxOFz{}~bF_$IGRPh&X6b;+Qa z7K#w+@R3jGskPLRU#}igK$^{~g(fh<^WL~d{FU!N#Do0WZ#YO!vkyofFVTHK7XVBg zIy*niJ6i*2Q>m!s{bqlqi9iMahm41{uu2-w`fnpQ zW?T;Dc|Hy8S1rUj$VhMXHbp{k(%syDqUBy880EJ=ix87oTN{7LE?7VbA# zA0g>Tf;Ge6_5ywELE*W3b2Kd|*AkwR8^?FH$CxVUDQ&+`o0yOuuHk(mgdK=vqh&LX z32<80WCRQTie^l2uLGR0^e5`a!@V4CJ+3)nMBzx3%Iedx!sp&Bo|TB>zHwcYKu3yO z1IU`svRK~D;L!3zMV)`6yMgA|pj*&+_3Enm?p;n!&KnT0(>WKswzrAd0UiWM=Q~Uq zQ0j9DCl$m8^4Xx^;BuP)1tQ%0^TU4j$S0bd2?w87wj{Ho9h>KOvx8%=&h zoUlfL={X&p>S}Awz+=V(bv6ZIJl+tw3XApvj%9dmxhJx*gUChL1;PFiR*R_Uv2d|Z z2xRG{+c{b_RNUa=j3x47Ul}=uoxUNVcHYc}t&r!?UuTB&Y|>mkB;Ww{q%rBs20@qA z^?D=2XmdMJk63Aw*?}Hx2{y<1u0IRxahma8>K$78gvF`)H%kTfi zg+o2tU`Nn(qm!~4^=N1NCxFQI<*Vq@Rq^5h1A)Z3?jq)t{)%n(~enyIP^8;CB9RAYgs zp&Y^hb99|-^C;u-8P6-I+I;2e917FdAzUy(#&pRY9HYO-fS#Ax5a@75zb9XuLjnBX zf0?Pka)m@VJNg(%iNGpVn)GuT?*aXF;OwjpoY;{-)k#vFE!y9*0JP=RC)P|9@)?}h zS8CDyRlsF*O)@bLQ8I~JLroN&o~cQ0=WI~(%E=twQFqUf=u9?^DYUbCl_4%%^6TA& z7MP%5eJY+b*hJgA98@n26q{sgxjqL01jB|}?)yn-FEcD?>)9_m`?|znmEsi;6I$nH ziY4TiQ+MO}pJOZANzyxLnflSi z+>Dh>TK6pj{+#2%=PED8Qkqr4mrKWg`uwHv39IOcsaEcSx==H^7Qb5r1Xm++ zXK?nX?pBbR+H1e#ik%XP2KDxvq=l`-Hg1-B53KwMGW^Z$Xpb6XfN@2tvE8ELjO6#6 zbc-UrZ2B-yQoU5H!g(I>vrIJ&FHie^eoVgDzI5 z{yrGb?I)9tfDF;MA4N)R-SqIz!x@Hib;EFoVonp%(`t!AlO+WhIPP=B6S{gw=aq^u z1s{0F$I>o{V<$*bS+CtdvU*hU-)&0r}{4eM`J$?2i62YKf5B0{6B1 z7H#8YXUZGJyUbyKrwl}G1s-nhrS@)|$woJ>2r}W2;b>(${ZOngcmGT;w)ML^%pu*q z;U*I=22Dct|B{XXDG<0%c0W6QqP~FjnXa}Xp7*}>`U*{1Ln zH4=yS_4IpsIywW;Q%oBUkUaq-JA5v*KklG7l8L#bU&lcUGTgHlT#=geK+HzhW_{r+8!0YKqNtGu?X$Y?NzqF0|yPXg-1PF|D4;*glO309`vm3hyhlp;@ zif1m4E?mGYBE^7xUI-8nd`Q*p1mWXcif9_C#V^+wRZI1}Jfp8@58=H&v&3M#CQT*s zSX$=?lGRv!Y;GlKg)@H%zcq`EZYMUrEClGS^DTfj{bJ_J0R zqTT%vMi%TURB;N1SLJS~@Ju#b>;nAa2E$+XoF$}34c2BLm0$C#S=81KkZd7TX^-(; zVYg+&eR4u4AC2dJxPK0Y`#M_l#5Y^67jBjA&8~E!t3#!Q2Mk&m0p&#xi4q<)yCdY55B?7m%unwNSw6Aq&>ig!ys= zz8e-$od@bQo2}>DYw3AEAN-A6=5UyQBGhn;A}1#+VPR2mn$kdS^ld{NlSz0t^q+tw&*sg~)LLY(OG-6RiO zmcb6c4*nM;X>X{oCh3VI8>b= zI?G-HM?$oCyW3SfMB1v>5Uf&YnMW0$m@q7)!`S}T<~AOeyOn~J_>uwK(@?iK7KuelLW`qE zwp5m}iNt82yV5>YL^INNeqeqey^drBGN#Igsan-8MUnyUmvyC5%Qw|9t!cAySL+$s zEDcrhSo~eZb7)4>MSV%l4#1?`=-dzB?t0{V2uyVYFaJV@_Ul1L!K=J-^i9{-9xCXO z=QD|43`|8t3=9myvg_)TW>$Cz^TCeTC^$p$IX%7e#2gbJsx?3Uu zCZ_DmAp+Id-!(CZ2#_l8J1>vpZu)|P0x>JA#|CSXe7$p6Z!_L?gLFx`W3%=ktQU%k zyyMFrbTl2quJ#QF~juN(YSy5M~T-oPl0m<-@`?GJ7<;T?0pG{*$y)?73#y-y$@Hqdqak16z zvdkJ6s#W)({Q8o>o5;uhvC{fNIZ6qm^&nzIw7oYaC593sy6lF+CtUi^-!4oB>0dnh z4{my9g!261^BRB^52-9%5QVzvO6c**9fwsA3!` z)G+3FZz7Cgv$G5_NpU`u+49GrWm{I;ERQz7Jh+(!dYf;~a7Xe`<_#c-fVGOcXRZDV z?=eb#cOiBCUG&<&TRxmzlng$TPCnVFq`5Y*&#L@b9)z-OwRffB`!ck>1QRaV_f}4z zeifBchAFUy`RJezCI<7imqQ&i#o+J9l=>Q#LsYy4UYN99zu@rjb|pvmtW$AE$4?sU z>co)9I<@e){WE<=crB|a%|60`_&Ls$wUGm^59(fjW13RSLc1e77ZH>s3 z;p+F!z2pQRDz^@KTH#z{y<}hw1lFYJN?piedsk@qkY7S38*I_43;=3pY-@P1-gZx4zE9^Kw$FMj9b#XUDHE51a2wt1Ol8eN5U8@fkmGs$Z^$Q z?@RhIFu)V}Qu!=}^m4R+_CIf!p@I8Qa@yH)z6BSfvIjWdVVn2bN)H zSb1sFy_X;Z_`#4eNT?)gl2{?z$PuKIBQRoXgh&NbCkQ?uO-PaUkSvYX0l&{_uFm0s z)#KvjMH~~Eel?BRcKNN^U)}NNwu>is&pX3Ne53IVI!<84)`Y0QKJlw>jyJX!+@G&| zOgm~}mv<=0=y`siY<&>oD|wu*`F@P8IXKEt8B_Gh5Nw7QO6_)2m`E6oR~C;W@8`JV zFKTPX1=tCnrAJICge|MLYVrFZxDNcdnN_*cPb@Gmf`ML00KE53ka)m?zN-xRj3L@A z;;jZ62W-&J7KJ(KLODJGDA~7k2mHH#k<{KZ1kCY73k@!8BDWWYBT0eKYsjRk9dittgZc_wa^EO+kSynnuBdw4HKH4IUu~+ambG_ z?8*~;aqyKI@jA1^!BL0g7+*A8jo2$(dTS&qRX`mZHdEZiIc4I_BV*;D2^64^kObeR 
zlopF^1knF$YVZA}E!AmuM1MK_EF$MbmhD>r?}F<{D&!}seFkEh$4}wKe{MK9tBdb#$08S^U-Czx7P**4 z;Jv;+8N4|Bj$yYw%6KwGD;vL9diT3D3bYF%JIM+6u#+w0;fQjpZNL@pi%?K|!%MA<{PRUtipyiUHgGiKUe}K2m`ITv=JFmX*_c zh^XG$)Fj%3K_tQYY2N}-}uL9(-Y30_|Gdv&9D_`I{XQutv8Y=XUMx*rlJ;k88V zaF6YD&dwiA-&l1H$!tF^f3_5D6r<4?uK999~T-w_^ zsdcPJ13pN-=zNZNlD_xF-yYWc+1o*0P;G-Qz@}kD18c)Da;-1kG6b@BkIH%x*<;g(fX7>3tLA zMI++cN>3bft?iOuk|^mXollq3)6+_NdN%V7u18cCZ!R0SEyoFY9qONUbal<}0^$qx zzBeWWsTBASI@48vLh2Hv*>V!+f@GtwQ7@rDP9o#5iCksWh&{MO&2TYeh-hycfz}8W z^hf_$uA790oM0q2w8r^%u7bEjf|p%@7NycYFZ+!aD|TRqv`n~>H#_a4LMTb)48j^f zgWA*NaTN>lM_2qYL|S+kJPc>E@E>ajG+(APw}RJV!<_hh8A(-P)F>`+K_~XEug(%^ zC)8$7AMIFnXfe&kavFB?3Xpl8k$bmi3JYFg_-xPO;h24+=RKBDa$U8&KAd}+D&YLW zdNKD+Ys{d**)=TV3a_3|5kohTW2)Q5;i-S};nUg=)|zX%9mfpfoqhbvKHLi}iuOq) zU;Tmq7VDh8I;DURCL>3Mfl}NVlY1CG3TVO*#ucHKy`w+)q889~|JRDsGpmSmehjoe zqobO}ohUZge&C*6Gx6G+<_56%`TiUQZWXweSzcA6F{>A`=V6`||77%kdXLOVfW87j zY~Vduq&yUhA)zb1DwF&X7UC)bC$A zlhsx!Fd37*tZZx;(1+Qg($ZKrCYNr8nWswp?=KQD4X7y%ixU#NJ8V-`l_8CN8lXXy zo2unx%+J&l0wSYo(?NR9iLa7;jm=zV+yl|2lf4wX{n2x?g2J}bDL2=%HC}dOZG6c{ zdGNw%;@ao^9iiYN>WRwYDkAw|!3OYJ+HA(>kDMY83vu$@s^xy4G)5Ft9t7`ZXz5ovq~W=nsU1y2G;ZbbzJ9>^#NNQj7y#hrM2>h_Tlsyu-E z3{kCt{HO!8^xTsrGr;>Mk>6kFpoy37!>uNE0|yJq6V`+?yS*K_suU=}Nc(@9gP@I_ z9CEp8IkS0t%4gPITaDw*rqj<2o>$f7XbbgD%nGS|`bF(i4*R9I+okO0!#NS0DB}V% zH4^ziLE7?0-RBBppBmWz9b$DMu`r&Gw!`66X=E3%e6U^UfE%9|f`@$!aj~vhGBIZy zNE4#VJ*Y;0s}|6{M9pE^s$Ar{ghvDT9NIAhntHWCsmAVxY_^-ypBLQLR#?wP9*A9!5q z!`K%MYM0lnoWoqdi-+Rl;`Apr8p>=AL`XzD2-Vc$VA=n)$HvEoB&P*(FlM*+zeV2f z)hh$_*)8NN7{`F!0(k5Hi*OlqWxJL`-0y)1;OWijh@$JpCl}k_r{**?G;g+sb1(XB zy!^`zyPQ2W^&T?aS3kb{>c>x@4*8ryM}nuDNC*(dr^G$fD1ZUA{~5R!cyH9}fp%o= z|NlEO6fw{+bxr!)!aspi;vctmW~?NZK@A z(shOto9cPqOEjKlgKlH~*&eLt=BkR`E6c^W7rt8CUzl^@Unc=;$GIhn5GxzaDNHi) zj&0uIyHq~=7KnNy3d9e_eqlAZ;w^YaMedy?S)?`-5AF!^Xf;r1y0;OoqX#z}utc4p;D~ z0T|6#Cn=yI6@#sSbbAF!%!!hEcSRZl;dj~QbAJ09ZXlJilGpthfYqNFd^0=PNXGLAONdd-@n(L zVpQpBZl+Ur`!FGW`9n6az%KKL$tkqPe9>#`b9=zfNR1QqvLdp%E7ccm{2SZ>@$le* zy>8N=`RCFYdmp^zTh*}R3WFTe{%B6mx?MOkos8dZ<&pinIuKO{Ly*Y>yy=+4Qb+|a zRGi24>`oGV0N5()wy_3ni61y{H&S|S4Cp@Lty%tYD}H_XO_AMeSQFTt?d8%q$rueK zSA1(v=VF={iT+3xp$23m$hG(vV?Yyp#$<3LV2L01G^!aHPC82C8^a;F~JJ^^_332JG?x(bo0~z@xXv( z5ynTNEsByHT6$z#)(aLAmMS>O@Tcau6Q859RoDfL8>_T$wRsOHe|0yoLO$n~!-W5; z0_lqC%lJ0dyJjejl|QjvMa%=q2*BQ@HwQl`G%VtOsJa6xVvP=c^yA4gVPDi_Wo143 za7OlhT)x5XMw#)^PI44C^@BRMI?by#?IJJhs=CaBqJwP3+)5iYC%k#DE33D2)p&So zc=i#t1cl~?WpA|5Utbl+p7qJ?FxXhnX}F%&iSDO^4*%ci&-dMWhPpVYk?F0x zNT^30=v!fSh3y`RpEh}{XM-9Wg^Y|%l1{X*bjYk^o;`yD@8`-ho}Gbi54v$SjRuf1 z8DN!n`&_Os>{d+Xfe1Q|S>q9S$n$~2YXLNc`=YFKzengSSAi$N8CW5=^SYm9qlyXB zbbe=Bt7-3*1#x%yw3vEa_b^G=e}+kfV_=q|(f$!05jeXl+g6mi#h@7<8lZ1y#w)$> zreOt$>b}hZWFCzM1ZS)Gr|{Y=gM|G_ps$dX}%zUCu& zNn{Q&^opI}_l4uU_q=hskNpXSGQ^QzVKR&oyuK9U^~;BM!Z2|%J>tgW?KmoU0Vxno z#&#m&S`QH%5ifG}`qU!hrtGEqCQs@c##>TOxvhqVni<<)ZEn|&$AnGu-$>VTB?v|N z;T(<$45unBIq00SW^sc)#-SfE8?C~Af3yXm)mDuig87WR#kEdhsQ}bU90K-e)_Arh zb%jr5MkT441n~8j?CBVdE&7uTFh8+k?>~s@Y2=kDSvf`kIPrlK;;k;|ip+uKA3$SI z|2s660Ksjgi?3#8WtB8HFSMgGl@f9OnN5s4m;NYMrs2*{z1N=f2i08SXBk8s{7aL73}DrxrPAPBKxY}5LE+yONL^LoJlPB0v8>8H;jh=Q?drtK5EWK z2yZ~C8Gj^<z#w`?_Ci6vC!S|) z<;wrc_WICE9->#M=T^~YYYmG&E4cP4ftadhu8EMhsA9lD@0|if zqG_k#PW-P)K7aP8qu}AqBQS0*gPb=$x8oe&l*F2>2e_0B>y?`(3XqV&?;kL_8r@Ey zw7Oin2r0;c>16C_!V@rNXMlFB*#&>?SPM2mt?_3ije#!4!F zeVGHFT}k9$(kYP zze4rgl(CKbmPDui$%BAb%xkWndwZ3%UWZ7JQ~>+QOpD^}Ybo#T?>gxI&(HJe?7BH= z4-Hl@$*?HVm6o<4RhGm}Ak?Kc=RY;Aj~#`0r+;0t?#?t|e83==R8ym+%BC)JG!}X7 zEh{4-e;?^WNQJU@bxgqG57v_knrj1%PynCZHuOXMaMuM6L4+^(wLjbvY%mOI*;v+8 
[GIT binary patch payload omitted: base85-encoded binary file data, not human-readable]