From 30679fffd32a6653ae53b58e1ad3b240330aa235 Mon Sep 17 00:00:00 2001 From: Neha J Date: Fri, 17 May 2024 05:05:03 -0700 Subject: [PATCH 01/32] Adding GPU-MPC --- GPU-MPC/Dockerfile_Gen | 34 + GPU-MPC/Makefile | 132 + GPU-MPC/README.md | 123 + GPU-MPC/backend/orca.h | 243 + GPU-MPC/backend/orca_base.h | 311 + GPU-MPC/backend/piranha.h | 160 + GPU-MPC/backend/sigma.h | 428 ++ GPU-MPC/experiments/__init__.py | 21 + GPU-MPC/experiments/orca/__init__.py | 21 + GPU-MPC/experiments/orca/cnn.h | 1388 ++++ GPU-MPC/experiments/orca/config.json | 22 + .../datasets/cifar-10/download-cifar10.sh | 12 + GPU-MPC/experiments/orca/datasets/cifar10.h | 333 + GPU-MPC/experiments/orca/datasets/gpu_data.cu | 71 + GPU-MPC/experiments/orca/datasets/gpu_data.h | 15 + GPU-MPC/experiments/orca/datasets/mnist.cpp | 17 + GPU-MPC/experiments/orca/datasets/mnist.h | 227 + GPU-MPC/experiments/orca/model_accuracy.cu | 30 + GPU-MPC/experiments/orca/model_accuracy.h | 228 + GPU-MPC/experiments/orca/orca_dealer.cu | 360 ++ GPU-MPC/experiments/orca/orca_evaluator.cu | 504 ++ GPU-MPC/experiments/orca/orca_inference.cu | 122 + GPU-MPC/experiments/orca/piranha.cu | 119 + GPU-MPC/experiments/orca/run_experiment.py | 326 + GPU-MPC/experiments/orca/share_data.cpp | 182 + GPU-MPC/experiments/sigma/bert.h | 103 + GPU-MPC/experiments/sigma/gpt2.h | 125 + GPU-MPC/experiments/sigma/llama2.h | 120 + GPU-MPC/experiments/sigma/sigma.cu | 204 + GPU-MPC/experiments/utils.py | 115 + GPU-MPC/ext/sytorch/.gitignore | 25 + GPU-MPC/ext/sytorch/CMakeLists.txt | 372 ++ GPU-MPC/ext/sytorch/README.md | 24 + .../Toy example- multiple inference.md | 113 + .../sytorch/Toy example- single inference.md | 97 + GPU-MPC/ext/sytorch/examples/bert.cpp | 554 ++ .../ext/sytorch/examples/bertbenchmark.cpp | 260 + .../sytorch/examples/gpt-neo_nexttoken.cpp | 499 ++ GPU-MPC/ext/sytorch/examples/gpt2.cpp | 762 +++ .../ext/sytorch/examples/gpt2benchmark.cpp | 244 + .../ext/sytorch/examples/gpt2correctness.cpp | 336 + 
GPU-MPC/ext/sytorch/examples/gpt2dummy.cpp | 162 + GPU-MPC/ext/sytorch/examples/gptneo.cpp | 398 ++ .../ext/sytorch/examples/gptneobenchmark.cpp | 304 + GPU-MPC/ext/sytorch/examples/llama7b.cpp | 263 + GPU-MPC/ext/sytorch/examples/resnet18.cpp | 210 + GPU-MPC/ext/sytorch/examples/resnet50.cpp | 404 ++ GPU-MPC/ext/sytorch/examples/vgg16.cpp | 185 + .../ext/sytorch/ext/bitpack/CMakeLists.txt | 22 + .../ext/bitpack/include/bitpack/bitpack.h | 11 + .../ext/bitpack/src/bitpack/bitpack.cpp | 136 + .../ext/sytorch/ext/bitpack/tests/test.cpp | 38 + .../sytorch/ext/cryptoTools/CMakeLists.txt | 18 + GPU-MPC/ext/sytorch/ext/cryptoTools/LICENSE | 50 + .../cryptoTools/Common/Defines.cpp | 88 + .../cryptoTools/cryptoTools/Common/Defines.h | 149 + .../cryptoTools/cryptoTools/Common/Log.cpp | 136 + .../ext/cryptoTools/cryptoTools/Common/Log.h | 222 + .../cryptoTools/cryptoTools/Common/config.h | 27 + .../cryptoTools/cryptoTools/Crypto/AES.cpp | 792 +++ .../ext/cryptoTools/cryptoTools/Crypto/AES.h | 136 + .../cryptoTools/cryptoTools/Crypto/PRNG.cpp | 78 + .../ext/cryptoTools/cryptoTools/Crypto/PRNG.h | 178 + .../cryptoTools/cryptoTools/gsl/GSL.natvis | 98 + .../cryptoTools/cryptoTools/gsl/gls-lite.hpp | 2382 +++++++ .../ext/cryptoTools/cryptoTools/gsl/gsl | 207 + .../cryptoTools/cryptoTools/gsl/gsl_algorithm | 61 + .../cryptoTools/cryptoTools/gsl/gsl_assert | 97 + .../ext/cryptoTools/cryptoTools/gsl/gsl_byte | 190 + .../ext/cryptoTools/cryptoTools/gsl/gsl_util | 170 + .../cryptoTools/cryptoTools/gsl/multi_span | 2266 +++++++ .../ext/cryptoTools/cryptoTools/gsl/span | 735 +++ .../cryptoTools/cryptoTools/gsl/string_span | 847 +++ GPU-MPC/ext/sytorch/ext/llama/CMakeLists.txt | 64 + GPU-MPC/ext/sytorch/ext/llama/and.cpp | 75 + GPU-MPC/ext/sytorch/ext/llama/and.h | 28 + GPU-MPC/ext/sytorch/ext/llama/api.cpp | 4842 ++++++++++++++ GPU-MPC/ext/sytorch/ext/llama/clip.cpp | 93 + GPU-MPC/ext/sytorch/ext/llama/clip.h | 28 + GPU-MPC/ext/sytorch/ext/llama/conv.cpp | 472 ++ 
GPU-MPC/ext/sytorch/ext/llama/dcf.cpp | 378 ++ GPU-MPC/ext/sytorch/ext/llama/dpf.cpp | 538 ++ .../ext/sytorch/ext/llama/fixtobfloat16.cpp | 135 + GPU-MPC/ext/sytorch/ext/llama/fixtobfloat16.h | 29 + GPU-MPC/ext/sytorch/ext/llama/float.cpp | 209 + GPU-MPC/ext/sytorch/ext/llama/float.h | 46 + .../ext/sytorch/ext/llama/include/llama/api.h | 196 + .../sytorch/ext/llama/include/llama/array.h | 72 + .../sytorch/ext/llama/include/llama/assert.h | 30 + .../sytorch/ext/llama/include/llama/comms.h | 488 ++ .../sytorch/ext/llama/include/llama/config.h | 38 + .../sytorch/ext/llama/include/llama/conv.h | 129 + .../ext/sytorch/ext/llama/include/llama/dcf.h | 82 + .../ext/sytorch/ext/llama/include/llama/dpf.h | 36 + .../sytorch/ext/llama/include/llama/freekey.h | 410 ++ .../ext/llama/include/llama/group_element.h | 120 + .../ext/llama/include/llama/input_prng.h | 61 + .../sytorch/ext/llama/include/llama/keypack.h | 445 ++ .../sytorch/ext/llama/include/llama/prng.h | 29 + .../sytorch/ext/llama/include/llama/stats.h | 77 + .../sytorch/ext/llama/include/llama/utils.h | 173 + GPU-MPC/ext/sytorch/ext/llama/lut.cpp | 225 + GPU-MPC/ext/sytorch/ext/llama/lut.h | 33 + GPU-MPC/ext/sytorch/ext/llama/mic.cpp | 89 + GPU-MPC/ext/sytorch/ext/llama/mic.h | 34 + GPU-MPC/ext/sytorch/ext/llama/msnzb.cpp | 76 + GPU-MPC/ext/sytorch/ext/llama/msnzb.h | 34 + GPU-MPC/ext/sytorch/ext/llama/mult.cpp | 115 + GPU-MPC/ext/sytorch/ext/llama/mult.h | 51 + GPU-MPC/ext/sytorch/ext/llama/pubcmp.cpp | 61 + GPU-MPC/ext/sytorch/ext/llama/pubcmp.h | 27 + GPU-MPC/ext/sytorch/ext/llama/pubdiv.cpp | 248 + GPU-MPC/ext/sytorch/ext/llama/pubdiv.h | 61 + GPU-MPC/ext/sytorch/ext/llama/relu.cpp | 312 + GPU-MPC/ext/sytorch/ext/llama/relu.h | 64 + GPU-MPC/ext/sytorch/ext/llama/select.cpp | 71 + GPU-MPC/ext/sytorch/ext/llama/select.h | 26 + GPU-MPC/ext/sytorch/ext/llama/signextend.cpp | 80 + GPU-MPC/ext/sytorch/ext/llama/signextend.h | 28 + .../ext/sytorch/ext/llama/src/llama/comms.cpp | 1808 ++++++ 
.../sytorch/ext/llama/src/llama/config.cpp | 35 + .../ext/llama/src/llama/input_prng.cpp | 173 + .../ext/sytorch/ext/llama/src/llama/prng.cpp | 48 + .../ext/sytorch/ext/llama/src/llama/stats.cpp | 105 + .../ext/sytorch/ext/llama/src/llama/utils.cpp | 685 ++ GPU-MPC/ext/sytorch/ext/llama/taylor.cpp | 172 + GPU-MPC/ext/sytorch/ext/llama/taylor.h | 43 + GPU-MPC/ext/sytorch/ext/llama/truncate.cpp | 108 + GPU-MPC/ext/sytorch/ext/llama/truncate.h | 38 + GPU-MPC/ext/sytorch/ext/llama/wrap.cpp | 125 + GPU-MPC/ext/sytorch/ext/llama/wrap.h | 29 + GPU-MPC/ext/sytorch/ext/sci/CMakeLists.txt | 44 + GPU-MPC/ext/sytorch/ext/sci/README.md | 59 + GPU-MPC/ext/sytorch/ext/sci/cmake/.gitignore | 1 + .../sytorch/ext/sci/cmake/SCIConfig.cmake.in | 18 + .../sci/cmake/SecureFixedPointConfig.cmake | 16 + .../sytorch/ext/sci/cmake/install_EMP.cmake | 49 + .../ext/sci/cmake/install_Eigen3.cmake | 18 + GPU-MPC/ext/sytorch/ext/sci/cmake/seal.patch | 13 + .../ext/sci/src/BuildingBlocks/CMakeLists.txt | 10 + .../sci/src/BuildingBlocks/aux-protocols.cpp | 745 +++ .../sci/src/BuildingBlocks/aux-protocols.h | 167 + .../ext/sci/src/BuildingBlocks/truncation.cpp | 298 + .../ext/sci/src/BuildingBlocks/truncation.h | 111 + .../src/BuildingBlocks/value-extension.cpp | 111 + .../sci/src/BuildingBlocks/value-extension.h | 50 + .../ext/sytorch/ext/sci/src/CMakeLists.txt | 123 + .../ext/sci/src/FloatingPoint/CMakeLists.txt | 4 + .../ext/sci/src/FloatingPoint/bool-data.cpp | 171 + .../ext/sci/src/FloatingPoint/bool-data.h | 162 + .../ext/sci/src/FloatingPoint/fixed-point.cpp | 1095 ++++ .../ext/sci/src/FloatingPoint/fixed-point.h | 414 ++ .../sci/src/FloatingPoint/floating-point.cpp | 1611 +++++ .../sci/src/FloatingPoint/floating-point.h | 462 ++ .../sci/src/FloatingPoint/fp-math-coeffs.h | 712 +++ .../ext/sci/src/FloatingPoint/fp-math.cpp | 924 +++ .../ext/sci/src/FloatingPoint/fp-math.h | 65 + .../ext/sytorch/ext/sci/src/GC/CMakeLists.txt | 7 + GPU-MPC/ext/sytorch/ext/sci/src/GC/aes_opt.h | 152 + 
GPU-MPC/ext/sytorch/ext/sci/src/GC/bit.h | 68 + GPU-MPC/ext/sytorch/ext/sci/src/GC/bit.hpp | 80 + .../ext/sci/src/GC/circuit_execution.h | 60 + .../ext/sytorch/ext/sci/src/GC/comparable.h | 57 + .../ext/sytorch/ext/sci/src/GC/emp-sh2pc.h | 4 + .../ext/sytorch/ext/sci/src/GC/emp-tool.cpp | 43 + GPU-MPC/ext/sytorch/ext/sci/src/GC/emp-tool.h | 19 + GPU-MPC/ext/sytorch/ext/sci/src/GC/f2k.h | 232 + .../sytorch/ext/sci/src/GC/halfgate_eva.cpp | 52 + .../ext/sytorch/ext/sci/src/GC/halfgate_eva.h | 66 + .../sytorch/ext/sci/src/GC/halfgate_gen.cpp | 61 + .../ext/sytorch/ext/sci/src/GC/halfgate_gen.h | 80 + GPU-MPC/ext/sytorch/ext/sci/src/GC/integer.h | 87 + .../ext/sytorch/ext/sci/src/GC/integer.hpp | 424 ++ GPU-MPC/ext/sytorch/ext/sci/src/GC/mitccrh.h | 86 + GPU-MPC/ext/sytorch/ext/sci/src/GC/number.h | 74 + .../ext/sci/src/GC/protocol_execution.h | 55 + .../ext/sytorch/ext/sci/src/GC/semihonest.h | 51 + GPU-MPC/ext/sytorch/ext/sci/src/GC/sh_eva.h | 112 + GPU-MPC/ext/sytorch/ext/sci/src/GC/sh_gen.h | 118 + GPU-MPC/ext/sytorch/ext/sci/src/GC/sh_party.h | 68 + .../ext/sytorch/ext/sci/src/GC/swappable.h | 50 + GPU-MPC/ext/sytorch/ext/sci/src/GC/utils.h | 47 + .../ext/sci/src/LinearHE/CMakeLists.txt | 34 + .../ext/sci/src/LinearHE/conv-field.cpp | 946 +++ .../sytorch/ext/sci/src/LinearHE/conv-field.h | 173 + .../sytorch/ext/sci/src/LinearHE/defines-HE.h | 42 + .../sci/src/LinearHE/elemwise-prod-field.cpp | 220 + .../sci/src/LinearHE/elemwise-prod-field.h | 56 + .../sytorch/ext/sci/src/LinearHE/fc-field.cpp | 395 ++ .../sytorch/ext/sci/src/LinearHE/fc-field.h | 98 + .../ext/sci/src/LinearHE/generate_primes.py | 26 + .../sytorch/ext/sci/src/LinearHE/utils-HE.cpp | 237 + .../sytorch/ext/sci/src/LinearHE/utils-HE.h | 70 + .../ext/sci/src/LinearOT/CMakeLists.txt | 6 + .../ext/sci/src/LinearOT/linear-ot.cpp | 813 +++ .../sytorch/ext/sci/src/LinearOT/linear-ot.h | 113 + .../ext/sci/src/LinearOT/linear-uniform.h | 750 +++ .../sytorch/ext/sci/src/Math/CMakeLists.txt | 4 + 
.../ext/sci/src/Math/math-functions.cpp | 889 +++ .../sytorch/ext/sci/src/Math/math-functions.h | 82 + .../ext/sci/src/Millionaire/CMakeLists.txt | 4 + .../src/Millionaire/bit-triple-generator.h | 323 + .../ext/sci/src/Millionaire/equality.h | 353 ++ .../ext/sci/src/Millionaire/millionaire.h | 557 ++ .../Millionaire/millionaire_with_equality.h | 391 ++ .../ext/sci/src/NonLinear/CMakeLists.txt | 4 + .../sytorch/ext/sci/src/NonLinear/argmax.h | 476 ++ .../ext/sci/src/NonLinear/drelu-field.h | 355 ++ .../sytorch/ext/sci/src/NonLinear/maxpool.h | 181 + .../ext/sci/src/NonLinear/relu-field.h | 213 + .../ext/sci/src/NonLinear/relu-interface.h | 31 + .../sytorch/ext/sci/src/NonLinear/relu-ring.h | 251 + .../ext/sytorch/ext/sci/src/OT/CMakeLists.txt | 4 + GPU-MPC/ext/sytorch/ext/sci/src/OT/emp-ot.h | 13 + GPU-MPC/ext/sytorch/ext/sci/src/OT/ideal.h | 159 + GPU-MPC/ext/sytorch/ext/sci/src/OT/iknp.h | 783 +++ GPU-MPC/ext/sytorch/ext/sci/src/OT/kkot.h | 239 + GPU-MPC/ext/sytorch/ext/sci/src/OT/np.h | 218 + GPU-MPC/ext/sytorch/ext/sci/src/OT/ot-utils.h | 217 + GPU-MPC/ext/sytorch/ext/sci/src/OT/ot.h | 124 + GPU-MPC/ext/sytorch/ext/sci/src/OT/ot_pack.h | 124 + .../ext/sytorch/ext/sci/src/OT/split-iknp.h | 1023 +++ .../ext/sytorch/ext/sci/src/OT/split-kkot.h | 604 ++ .../ext/sytorch/ext/sci/src/OT/split-utils.h | 190 + .../ext/sci/src/cleartext_library_fixed.cpp | 1224 ++++ .../ext/sci/src/cleartext_library_fixed.h | 180 + .../sci/src/cleartext_library_fixed_uniform.h | 3061 +++++++++ .../ext/sci/src/cleartext_library_float.cpp | 162 + .../ext/sci/src/cleartext_library_float.h | 62 + GPU-MPC/ext/sytorch/ext/sci/src/defines.h | 54 + .../ext/sytorch/ext/sci/src/defines_float.h | 37 + .../ext/sytorch/ext/sci/src/defines_uniform.h | 88 + .../ext/sci/src/functionalities_uniform.h | 1127 ++++ GPU-MPC/ext/sytorch/ext/sci/src/globals.cpp | 118 + GPU-MPC/ext/sytorch/ext/sci/src/globals.h | 140 + .../ext/sytorch/ext/sci/src/globals_float.cpp | 35 + 
.../ext/sytorch/ext/sci/src/globals_float.h | 40 + .../ext/sytorch/ext/sci/src/library_fixed.cpp | 2810 +++++++++ .../ext/sytorch/ext/sci/src/library_fixed.h | 298 + .../ext/sci/src/library_fixed_common.h | 361 ++ .../ext/sci/src/library_fixed_uniform.cpp | 2093 +++++++ .../ext/sci/src/library_fixed_uniform.h | 209 + .../ext/sytorch/ext/sci/src/library_float.cpp | 1252 ++++ .../ext/sytorch/ext/sci/src/library_float.h | 207 + .../ext/sci/src/utils/ArgMapping/ArgMapping.h | 142 + .../ext/sci/src/utils/ArgMapping/LICENSE | 202 + .../ext/sci/src/utils/ArgMapping/NOTICE | 3 + .../sytorch/ext/sci/src/utils/CMakeLists.txt | 27 + .../sytorch/ext/sci/src/utils/ThreadPool.h | 116 + .../ext/sytorch/ext/sci/src/utils/aes-ni.h | 408 ++ GPU-MPC/ext/sytorch/ext/sci/src/utils/aes.h | 147 + .../ext/sytorch/ext/sci/src/utils/aes_opt.h | 1052 ++++ GPU-MPC/ext/sytorch/ext/sci/src/utils/block.h | 393 ++ GPU-MPC/ext/sytorch/ext/sci/src/utils/ccrf.h | 58 + GPU-MPC/ext/sytorch/ext/sci/src/utils/ccrh.h | 65 + .../ext/sci/src/utils/cmake/FindGMP.cmake | 21 + .../utils/cmake/source_of_randomness.cmake | 27 + .../ext/sytorch/ext/sci/src/utils/constants.h | 573 ++ GPU-MPC/ext/sytorch/ext/sci/src/utils/crh.h | 84 + .../ext/sytorch/ext/sci/src/utils/emp-tool.h | 18 + GPU-MPC/ext/sytorch/ext/sci/src/utils/f2k.h | 205 + GPU-MPC/ext/sytorch/ext/sci/src/utils/group.h | 99 + .../sytorch/ext/sci/src/utils/group_openssl.h | 216 + GPU-MPC/ext/sytorch/ext/sci/src/utils/hash.h | 115 + .../sytorch/ext/sci/src/utils/io_channel.h | 84 + .../ext/sytorch/ext/sci/src/utils/io_pack.h | 44 + .../ext/sci/src/utils/net_io_channel.h | 205 + GPU-MPC/ext/sytorch/ext/sci/src/utils/prg.h | 300 + GPU-MPC/ext/sytorch/ext/sci/src/utils/prp.h | 113 + .../ext/sytorch/ext/sci/src/utils/sse2neon.h | 5566 +++++++++++++++++ GPU-MPC/ext/sytorch/ext/sci/src/utils/tccrh.h | 73 + .../sci/src/utils/ubuntu_terminal_colors.h | 39 + GPU-MPC/ext/sytorch/ext/sci/src/utils/utils.h | 89 + .../ext/sytorch/ext/sci/src/utils/utils.hpp | 445 ++ 
.../ext/sytorch/ext/sci/tests/CMakeLists.txt | 45 + .../ext/sytorch/ext/sci/tests/FindMPFR.cmake | 72 + .../sytorch/ext/sci/tests/GC/CMakeLists.txt | 9 + .../ext/sytorch/ext/sci/tests/GC/test_and.cpp | 49 + .../ext/sytorch/ext/sci/tests/GC/test_bit.cpp | 89 + .../ext/sytorch/ext/sci/tests/GC/test_int.cpp | 67 + .../sytorch/ext/sci/tests/GC/test_msnzb.cpp | 126 + .../ext/sytorch/ext/sci/tests/float_utils.h | 13 + .../ext/sci/tests/test_field_argmax.cpp | 186 + .../sytorch/ext/sci/tests/test_field_conv.cpp | 141 + .../sci/tests/test_field_elemwise_prod.cpp | 80 + .../sytorch/ext/sci/tests/test_field_fc.cpp | 91 + .../ext/sci/tests/test_field_maxpool.cpp | 194 + .../sytorch/ext/sci/tests/test_field_relu.cpp | 271 + .../ext/sci/tests/test_float_bench_op.cpp | 236 + .../sytorch/ext/sci/tests/test_float_math.cpp | 308 + .../ext/sci/tests/test_float_primitive.cpp | 349 ++ .../ext/sci/tests/test_ring_argmax.cpp | 164 + .../ext/sci/tests/test_ring_aux_protocols.cpp | 484 ++ .../sytorch/ext/sci/tests/test_ring_exp.cpp | 192 + .../sci/tests/test_ring_hadamard_product.cpp | 99 + .../ext/sci/tests/test_ring_matmul.cpp | 211 + .../ext/sci/tests/test_ring_maxpool.cpp | 200 + .../sytorch/ext/sci/tests/test_ring_relu.cpp | 180 + .../ext/sci/tests/test_ring_sigmoid.cpp | 184 + .../sytorch/ext/sci/tests/test_ring_sqrt.cpp | 206 + .../sytorch/ext/sci/tests/test_ring_tanh.cpp | 184 + .../ext/sci/tests/test_ring_truncation.cpp | 189 + .../sci/tests/test_ring_value_extension.cpp | 145 + GPU-MPC/ext/sytorch/ezpc-cli-2.sh | 436 ++ GPU-MPC/ext/sytorch/ezpc-cli.sh | 298 + .../sytorch/include/sytorch/backend/backend.h | 194 + .../sytorch/backend/baseline_cleartext.h | 115 + .../include/sytorch/backend/cleartext.h | 130 + .../sytorch/backend/crypten_cleartext.h | 142 + .../sytorch/include/sytorch/backend/default.h | 35 + .../sytorch/include/sytorch/backend/float.h | 70 + .../include/sytorch/backend/llama_base.h | 578 ++ .../include/sytorch/backend/llama_extended.h | 89 + 
.../sytorch/backend/llama_transformer.h | 209 + .../sytorch/backend/piranha_cleartext.h | 135 + .../sytorch/backend/secureml_cleartext.h | 113 + GPU-MPC/ext/sytorch/include/sytorch/graph.h | 121 + .../sytorch/include/sytorch/layers/layers.h | 1758 ++++++ GPU-MPC/ext/sytorch/include/sytorch/module.h | 740 +++ GPU-MPC/ext/sytorch/include/sytorch/random.h | 30 + GPU-MPC/ext/sytorch/include/sytorch/softmax.h | 174 + GPU-MPC/ext/sytorch/include/sytorch/tensor.h | 801 +++ GPU-MPC/ext/sytorch/include/sytorch/utils.h | 442 ++ GPU-MPC/ext/sytorch/scores/.gitignore | 1 + GPU-MPC/ext/sytorch/scripts/dealer.py | 119 + GPU-MPC/ext/sytorch/scripts/diff.py | 33 + GPU-MPC/ext/sytorch/scripts/download_keys.py | 62 + GPU-MPC/ext/sytorch/scripts/gptacc.py | 46 + .../ext/sytorch/scripts/mnli_matched_acc.py | 38 + .../sytorch/scripts/mnnli_mismatched_acc.py | 38 + GPU-MPC/ext/sytorch/scripts/mrpcacc.py | 38 + GPU-MPC/ext/sytorch/scripts/qnli_acc.py | 38 + GPU-MPC/ext/sytorch/scripts/server.py | 33 + GPU-MPC/ext/sytorch/scripts/sst2acc.py | 37 + .../sytorch/backend/baseline_cleartext.cpp | 575 ++ .../sytorch/src/sytorch/backend/cleartext.cpp | 1108 ++++ .../ext/sytorch/src/sytorch/backend/float.cpp | 411 ++ GPU-MPC/ext/sytorch/src/sytorch/random.cpp | 31 + GPU-MPC/ext/sytorch/src/sytorch/softmax.cpp | 107 + GPU-MPC/ext/sytorch/tests/bf16.cpp | 67 + GPU-MPC/ext/sytorch/tests/clip.cpp | 88 + GPU-MPC/ext/sytorch/tests/dcf.cpp | 74 + GPU-MPC/ext/sytorch/tests/dcf_dpf_et.cpp | 58 + GPU-MPC/ext/sytorch/tests/dpf.cpp | 124 + GPU-MPC/ext/sytorch/tests/dpfet.cpp | 57 + GPU-MPC/ext/sytorch/tests/eigenbenchmark.cpp | 43 + .../ext/sytorch/tests/evalallbenchmark.cpp | 65 + GPU-MPC/ext/sytorch/tests/gelu_ulp.cpp | 68 + GPU-MPC/ext/sytorch/tests/lutss.cpp | 57 + .../ext/sytorch/tests/multi_party/bf16.cpp | 90 + .../ext/sytorch/tests/multi_party/clip.cpp | 87 + GPU-MPC/ext/sytorch/tests/multi_party/exp.cpp | 83 + .../ext/sytorch/tests/multi_party/gelu.cpp | 98 + 
.../ext/sytorch/tests/multi_party/gemm.cpp | 82 + .../sytorch/tests/multi_party/layernorm.cpp | 100 + GPU-MPC/ext/sytorch/tests/multi_party/lut.cpp | 89 + .../ext/sytorch/tests/multi_party/prtrunc.cpp | 91 + .../ext/sytorch/tests/multi_party/rsqrt.cpp | 84 + .../sytorch/tests/multi_party/sloth_ars.cpp | 85 + .../tests/multi_party/sloth_ars_faithful.cpp | 85 + .../sytorch/tests/multi_party/sloth_clip.cpp | 87 + .../sytorch/tests/multi_party/sloth_drelu.cpp | 87 + .../sytorch/tests/multi_party/sloth_lrs.cpp | 82 + .../tests/multi_party/sloth_maxpool.cpp | 81 + .../tests/multi_party/sloth_maxpool_tri.cpp | 84 + .../sytorch/tests/multi_party/sloth_relu.cpp | 87 + .../ext/sytorch/tests/multi_party/softmax.cpp | 91 + .../ext/sytorch/tests/multi_party/tanh.cpp | 82 + .../tests/multi_party/truncatereduce.cpp | 81 + GPU-MPC/ext/sytorch/tests/pubcmp.cpp | 58 + GPU-MPC/ext/sytorch/tests/sloth_drelu.cpp | 55 + GPU-MPC/ext/sytorch/tests/truncatereduce.cpp | 56 + GPU-MPC/ext/sytorch/tests/wrap.cpp | 92 + GPU-MPC/fss/dcf/gpu_dcf.cu | 378 ++ GPU-MPC/fss/dcf/gpu_dcf.h | 102 + GPU-MPC/fss/dcf/gpu_dcf_templates.h | 109 + GPU-MPC/fss/dcf/gpu_maxpool.cu | 119 + GPU-MPC/fss/dcf/gpu_maxpool.h | 54 + GPU-MPC/fss/dcf/gpu_relu.cu | 168 + GPU-MPC/fss/dcf/gpu_relu.h | 102 + GPU-MPC/fss/dcf/gpu_sgd.cu | 330 + GPU-MPC/fss/dcf/gpu_sgd.h | 57 + GPU-MPC/fss/dcf/gpu_sstab.h | 67 + GPU-MPC/fss/dcf/gpu_truncate.cu | 270 + GPU-MPC/fss/dcf/gpu_truncate.h | 132 + GPU-MPC/fss/gpu_add.h | 45 + GPU-MPC/fss/gpu_aes_shm.cu | 241 + GPU-MPC/fss/gpu_aes_shm.h | 52 + GPU-MPC/fss/gpu_aes_table.h | 1317 ++++ GPU-MPC/fss/gpu_and.cu | 56 + GPU-MPC/fss/gpu_and.h | 53 + GPU-MPC/fss/gpu_avgpool.cu | 92 + GPU-MPC/fss/gpu_avgpool.h | 53 + GPU-MPC/fss/gpu_conv2d.cu | 457 ++ GPU-MPC/fss/gpu_conv2d.h | 50 + GPU-MPC/fss/gpu_dpf.cu | 480 ++ GPU-MPC/fss/gpu_dpf.h | 107 + GPU-MPC/fss/gpu_dpf_templates.h | 135 + GPU-MPC/fss/gpu_fss_helper.h | 135 + GPU-MPC/fss/gpu_gelu.cu | 162 + GPU-MPC/fss/gpu_gelu.h | 78 + 
GPU-MPC/fss/gpu_inverse.cu | 49 + GPU-MPC/fss/gpu_inverse.h | 43 + GPU-MPC/fss/gpu_layernorm.cu | 229 + GPU-MPC/fss/gpu_layernorm.h | 92 + GPU-MPC/fss/gpu_linear_helper.cu | 175 + GPU-MPC/fss/gpu_linear_helper.h | 30 + GPU-MPC/fss/gpu_local_truncate.h | 62 + GPU-MPC/fss/gpu_lut.cu | 223 + GPU-MPC/fss/gpu_lut.h | 147 + GPU-MPC/fss/gpu_matmul.cu | 399 ++ GPU-MPC/fss/gpu_matmul.h | 98 + GPU-MPC/fss/gpu_maxpool.cu | 698 +++ GPU-MPC/fss/gpu_maxpool.h | 77 + GPU-MPC/fss/gpu_mha.cu | 207 + GPU-MPC/fss/gpu_mha.h | 190 + GPU-MPC/fss/gpu_mul.cu | 82 + GPU-MPC/fss/gpu_mul.h | 55 + GPU-MPC/fss/gpu_nexp.cu | 81 + GPU-MPC/fss/gpu_nexp.h | 57 + GPU-MPC/fss/gpu_relu.cu | 115 + GPU-MPC/fss/gpu_relu.h | 72 + GPU-MPC/fss/gpu_scalarmul.h | 52 + GPU-MPC/fss/gpu_select.cu | 135 + GPU-MPC/fss/gpu_select.h | 62 + GPU-MPC/fss/gpu_softmax.cu | 141 + GPU-MPC/fss/gpu_softmax.h | 59 + GPU-MPC/fss/gpu_sstab.h | 145 + GPU-MPC/fss/gpu_truncate.cu | 295 + GPU-MPC/fss/gpu_truncate.h | 155 + GPU-MPC/fss/gpu_window.cu | 159 + GPU-MPC/fss/gpu_window.h | 37 + GPU-MPC/nn/orca/avg_pool_layer.cu | 104 + GPU-MPC/nn/orca/avg_pool_layer.h | 55 + GPU-MPC/nn/orca/conv2d_layer.cu | 344 + GPU-MPC/nn/orca/conv2d_layer.h | 72 + GPU-MPC/nn/orca/fc_layer.cu | 340 + GPU-MPC/nn/orca/fc_layer.h | 79 + GPU-MPC/nn/orca/gpu_layer.h | 65 + GPU-MPC/nn/orca/gpu_model.h | 79 + GPU-MPC/nn/orca/maxpool_layer.cu | 181 + GPU-MPC/nn/orca/maxpool_layer.h | 60 + GPU-MPC/nn/orca/relu_extend_layer.cu | 117 + GPU-MPC/nn/orca/relu_extend_layer.h | 57 + GPU-MPC/nn/orca/relu_layer.cu | 117 + GPU-MPC/nn/orca/relu_layer.h | 58 + GPU-MPC/nn/orca_opt.h | 217 + GPU-MPC/setup.sh | 76 + GPU-MPC/tests/fss/dcf/aes.cu | 80 + GPU-MPC/tests/fss/dcf/dcf.cu | 116 + GPU-MPC/tests/fss/dcf/maxpool.cu | 150 + GPU-MPC/tests/fss/dcf/relu.cu | 98 + GPU-MPC/tests/fss/dcf/relu_extend.cu | 97 + GPU-MPC/tests/fss/dcf/stochastic_truncate.cu | 71 + GPU-MPC/tests/fss/dpf.cu | 107 + GPU-MPC/tests/fss/dpf_drelu.cu | 93 + GPU-MPC/tests/fss/dpf_eval_all.cu | 198 + 
GPU-MPC/tests/fss/dpf_lut.cu | 94 + GPU-MPC/tests/fss/gelu.cu | 104 + GPU-MPC/tests/fss/layernorm.cu | 136 + GPU-MPC/tests/fss/mha.cu | 225 + GPU-MPC/tests/fss/piranha_softmax.cu | 75 + GPU-MPC/tests/fss/relu.cu | 101 + GPU-MPC/tests/fss/rmsnorm.cu | 140 + GPU-MPC/tests/fss/secfloat_softmax.cu | 79 + GPU-MPC/tests/fss/silu.cu | 105 + GPU-MPC/tests/fss/softmax.cu | 133 + GPU-MPC/tests/fss/truncate.cu | 95 + GPU-MPC/tests/nn/orca/conv2d_test.cu | 142 + GPU-MPC/tests/nn/orca/fc_test.cu | 132 + GPU-MPC/tests/nn/orca/maxpool_test.cu | 177 + GPU-MPC/tests/nn/orca/relu_extend_test.cu | 96 + GPU-MPC/tests/nn/orca/relu_test.cu | 112 + GPU-MPC/utils/cpu_comms.h | 243 + GPU-MPC/utils/curand_utils.h | 68 + GPU-MPC/utils/exception.h | 151 + GPU-MPC/utils/gpu_comms.h | 326 + GPU-MPC/utils/gpu_data_types.h | 60 + GPU-MPC/utils/gpu_file_utils.cpp | 172 + GPU-MPC/utils/gpu_file_utils.h | 41 + GPU-MPC/utils/gpu_mem.cu | 133 + GPU-MPC/utils/gpu_mem.h | 38 + GPU-MPC/utils/gpu_random.cu | 212 + GPU-MPC/utils/gpu_random.h | 47 + GPU-MPC/utils/gpu_stats.h | 98 + GPU-MPC/utils/helper_cuda.h | 970 +++ GPU-MPC/utils/helper_cutlass.h | 41 + GPU-MPC/utils/helper_functions.h | 59 + GPU-MPC/utils/helper_string.h | 428 ++ GPU-MPC/utils/misc_utils.h | 236 + GPU-MPC/utils/sigma_comms.cpp | 182 + GPU-MPC/utils/sigma_comms.h | 97 + 492 files changed, 115841 insertions(+) create mode 100644 GPU-MPC/Dockerfile_Gen create mode 100644 GPU-MPC/Makefile create mode 100644 GPU-MPC/README.md create mode 100644 GPU-MPC/backend/orca.h create mode 100644 GPU-MPC/backend/orca_base.h create mode 100644 GPU-MPC/backend/piranha.h create mode 100644 GPU-MPC/backend/sigma.h create mode 100644 GPU-MPC/experiments/__init__.py create mode 100644 GPU-MPC/experiments/orca/__init__.py create mode 100644 GPU-MPC/experiments/orca/cnn.h create mode 100644 GPU-MPC/experiments/orca/config.json create mode 100755 GPU-MPC/experiments/orca/datasets/cifar-10/download-cifar10.sh create mode 100644 
GPU-MPC/experiments/orca/datasets/cifar10.h create mode 100644 GPU-MPC/experiments/orca/datasets/gpu_data.cu create mode 100644 GPU-MPC/experiments/orca/datasets/gpu_data.h create mode 100644 GPU-MPC/experiments/orca/datasets/mnist.cpp create mode 100644 GPU-MPC/experiments/orca/datasets/mnist.h create mode 100644 GPU-MPC/experiments/orca/model_accuracy.cu create mode 100644 GPU-MPC/experiments/orca/model_accuracy.h create mode 100644 GPU-MPC/experiments/orca/orca_dealer.cu create mode 100644 GPU-MPC/experiments/orca/orca_evaluator.cu create mode 100644 GPU-MPC/experiments/orca/orca_inference.cu create mode 100644 GPU-MPC/experiments/orca/piranha.cu create mode 100644 GPU-MPC/experiments/orca/run_experiment.py create mode 100644 GPU-MPC/experiments/orca/share_data.cpp create mode 100644 GPU-MPC/experiments/sigma/bert.h create mode 100644 GPU-MPC/experiments/sigma/gpt2.h create mode 100644 GPU-MPC/experiments/sigma/llama2.h create mode 100644 GPU-MPC/experiments/sigma/sigma.cu create mode 100644 GPU-MPC/experiments/utils.py create mode 100644 GPU-MPC/ext/sytorch/.gitignore create mode 100755 GPU-MPC/ext/sytorch/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/README.md create mode 100644 GPU-MPC/ext/sytorch/Toy example- multiple inference.md create mode 100644 GPU-MPC/ext/sytorch/Toy example- single inference.md create mode 100644 GPU-MPC/ext/sytorch/examples/bert.cpp create mode 100644 GPU-MPC/ext/sytorch/examples/bertbenchmark.cpp create mode 100644 GPU-MPC/ext/sytorch/examples/gpt-neo_nexttoken.cpp create mode 100644 GPU-MPC/ext/sytorch/examples/gpt2.cpp create mode 100644 GPU-MPC/ext/sytorch/examples/gpt2benchmark.cpp create mode 100644 GPU-MPC/ext/sytorch/examples/gpt2correctness.cpp create mode 100644 GPU-MPC/ext/sytorch/examples/gpt2dummy.cpp create mode 100644 GPU-MPC/ext/sytorch/examples/gptneo.cpp create mode 100644 GPU-MPC/ext/sytorch/examples/gptneobenchmark.cpp create mode 100644 GPU-MPC/ext/sytorch/examples/llama7b.cpp create mode 100644 
GPU-MPC/ext/sytorch/examples/resnet18.cpp create mode 100644 GPU-MPC/ext/sytorch/examples/resnet50.cpp create mode 100644 GPU-MPC/ext/sytorch/examples/vgg16.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/bitpack/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/bitpack/include/bitpack/bitpack.h create mode 100644 GPU-MPC/ext/sytorch/ext/bitpack/src/bitpack/bitpack.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/bitpack/tests/test.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/LICENSE create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/Common/Defines.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/Common/Defines.h create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/Common/Log.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/Common/Log.h create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/Common/config.h create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/Crypto/AES.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/Crypto/AES.h create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/Crypto/PRNG.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/Crypto/PRNG.h create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/gsl/GSL.natvis create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/gsl/gls-lite.hpp create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/gsl/gsl create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/gsl/gsl_algorithm create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/gsl/gsl_assert create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/gsl/gsl_byte create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/gsl/gsl_util create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/gsl/multi_span create mode 100644 
GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/gsl/span create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/gsl/string_span create mode 100644 GPU-MPC/ext/sytorch/ext/llama/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/llama/and.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/and.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/api.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/clip.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/clip.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/conv.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/dcf.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/dpf.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/fixtobfloat16.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/fixtobfloat16.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/float.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/float.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/api.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/array.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/assert.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/comms.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/config.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/conv.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/dcf.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/dpf.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/freekey.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/group_element.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/input_prng.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/keypack.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/prng.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/stats.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/utils.h create mode 
100644 GPU-MPC/ext/sytorch/ext/llama/lut.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/lut.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/mic.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/mic.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/msnzb.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/msnzb.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/mult.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/mult.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/pubcmp.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/pubcmp.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/pubdiv.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/pubdiv.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/relu.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/relu.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/select.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/select.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/signextend.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/signextend.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/src/llama/comms.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/src/llama/config.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/src/llama/input_prng.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/src/llama/prng.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/src/llama/stats.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/src/llama/utils.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/taylor.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/taylor.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/truncate.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/truncate.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/wrap.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/wrap.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/README.md create mode 100644 GPU-MPC/ext/sytorch/ext/sci/cmake/.gitignore create 
mode 100644 GPU-MPC/ext/sytorch/ext/sci/cmake/SCIConfig.cmake.in create mode 100644 GPU-MPC/ext/sytorch/ext/sci/cmake/SecureFixedPointConfig.cmake create mode 100644 GPU-MPC/ext/sytorch/ext/sci/cmake/install_EMP.cmake create mode 100644 GPU-MPC/ext/sytorch/ext/sci/cmake/install_Eigen3.cmake create mode 100644 GPU-MPC/ext/sytorch/ext/sci/cmake/seal.patch create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/BuildingBlocks/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/BuildingBlocks/aux-protocols.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/BuildingBlocks/aux-protocols.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/BuildingBlocks/truncation.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/BuildingBlocks/truncation.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/BuildingBlocks/value-extension.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/BuildingBlocks/value-extension.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/FloatingPoint/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/FloatingPoint/bool-data.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/FloatingPoint/bool-data.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/FloatingPoint/fixed-point.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/FloatingPoint/fixed-point.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/FloatingPoint/floating-point.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/FloatingPoint/floating-point.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/FloatingPoint/fp-math-coeffs.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/FloatingPoint/fp-math.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/FloatingPoint/fp-math.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/aes_opt.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/bit.h create mode 
100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/bit.hpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/circuit_execution.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/comparable.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/emp-sh2pc.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/emp-tool.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/emp-tool.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/f2k.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/halfgate_eva.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/halfgate_eva.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/halfgate_gen.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/halfgate_gen.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/integer.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/integer.hpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/mitccrh.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/number.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/protocol_execution.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/semihonest.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/sh_eva.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/sh_gen.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/sh_party.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/swappable.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/utils.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearHE/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearHE/conv-field.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearHE/conv-field.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearHE/defines-HE.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearHE/elemwise-prod-field.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearHE/elemwise-prod-field.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearHE/fc-field.cpp create mode 
100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearHE/fc-field.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearHE/generate_primes.py create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearHE/utils-HE.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearHE/utils-HE.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearOT/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearOT/linear-ot.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearOT/linear-ot.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearOT/linear-uniform.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/Math/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/Math/math-functions.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/Math/math-functions.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/Millionaire/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/Millionaire/bit-triple-generator.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/Millionaire/equality.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/Millionaire/millionaire.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/Millionaire/millionaire_with_equality.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/NonLinear/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/NonLinear/argmax.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/NonLinear/drelu-field.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/NonLinear/maxpool.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/NonLinear/relu-field.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/NonLinear/relu-interface.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/NonLinear/relu-ring.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/OT/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/OT/emp-ot.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/OT/ideal.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/OT/iknp.h create mode 100644 
GPU-MPC/ext/sytorch/ext/sci/src/OT/kkot.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/OT/np.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/OT/ot-utils.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/OT/ot.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/OT/ot_pack.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/OT/split-iknp.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/OT/split-kkot.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/OT/split-utils.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/cleartext_library_fixed.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/cleartext_library_fixed.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/cleartext_library_fixed_uniform.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/cleartext_library_float.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/cleartext_library_float.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/defines.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/defines_float.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/defines_uniform.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/functionalities_uniform.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/globals.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/globals.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/globals_float.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/globals_float.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/library_fixed.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/library_fixed.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/library_fixed_common.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/library_fixed_uniform.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/library_fixed_uniform.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/library_float.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/library_float.h create mode 100644 
GPU-MPC/ext/sytorch/ext/sci/src/utils/ArgMapping/ArgMapping.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/ArgMapping/LICENSE create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/ArgMapping/NOTICE create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/ThreadPool.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/aes-ni.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/aes.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/aes_opt.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/block.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/ccrf.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/ccrh.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/cmake/FindGMP.cmake create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/cmake/source_of_randomness.cmake create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/constants.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/crh.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/emp-tool.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/f2k.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/group.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/group_openssl.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/hash.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/io_channel.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/io_pack.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/net_io_channel.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/prg.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/prp.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/sse2neon.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/tccrh.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/ubuntu_terminal_colors.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/utils.h create 
mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/utils.hpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/FindMPFR.cmake create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/GC/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/GC/test_and.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/GC/test_bit.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/GC/test_int.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/GC/test_msnzb.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/float_utils.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_field_argmax.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_field_conv.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_field_elemwise_prod.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_field_fc.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_field_maxpool.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_field_relu.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_float_bench_op.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_float_math.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_float_primitive.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_ring_argmax.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_ring_aux_protocols.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_ring_exp.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_ring_hadamard_product.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_ring_matmul.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_ring_maxpool.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_ring_relu.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_ring_sigmoid.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_ring_sqrt.cpp create mode 100644 
GPU-MPC/ext/sytorch/ext/sci/tests/test_ring_tanh.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_ring_truncation.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_ring_value_extension.cpp create mode 100755 GPU-MPC/ext/sytorch/ezpc-cli-2.sh create mode 100755 GPU-MPC/ext/sytorch/ezpc-cli.sh create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/backend/backend.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/backend/baseline_cleartext.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/backend/cleartext.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/backend/crypten_cleartext.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/backend/default.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/backend/float.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/backend/llama_base.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/backend/llama_extended.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/backend/llama_transformer.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/backend/piranha_cleartext.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/backend/secureml_cleartext.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/graph.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/layers/layers.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/module.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/random.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/softmax.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/tensor.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/utils.h create mode 100644 GPU-MPC/ext/sytorch/scores/.gitignore create mode 100644 GPU-MPC/ext/sytorch/scripts/dealer.py create mode 100644 GPU-MPC/ext/sytorch/scripts/diff.py create mode 100644 GPU-MPC/ext/sytorch/scripts/download_keys.py create mode 100644 GPU-MPC/ext/sytorch/scripts/gptacc.py create mode 100644 GPU-MPC/ext/sytorch/scripts/mnli_matched_acc.py 
create mode 100644 GPU-MPC/ext/sytorch/scripts/mnnli_mismatched_acc.py create mode 100644 GPU-MPC/ext/sytorch/scripts/mrpcacc.py create mode 100644 GPU-MPC/ext/sytorch/scripts/qnli_acc.py create mode 100644 GPU-MPC/ext/sytorch/scripts/server.py create mode 100644 GPU-MPC/ext/sytorch/scripts/sst2acc.py create mode 100644 GPU-MPC/ext/sytorch/src/sytorch/backend/baseline_cleartext.cpp create mode 100644 GPU-MPC/ext/sytorch/src/sytorch/backend/cleartext.cpp create mode 100644 GPU-MPC/ext/sytorch/src/sytorch/backend/float.cpp create mode 100644 GPU-MPC/ext/sytorch/src/sytorch/random.cpp create mode 100644 GPU-MPC/ext/sytorch/src/sytorch/softmax.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/bf16.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/clip.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/dcf.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/dcf_dpf_et.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/dpf.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/dpfet.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/eigenbenchmark.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/evalallbenchmark.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/gelu_ulp.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/lutss.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/bf16.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/clip.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/exp.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/gelu.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/gemm.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/layernorm.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/lut.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/prtrunc.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/rsqrt.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/sloth_ars.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/sloth_ars_faithful.cpp create mode 100644 
GPU-MPC/ext/sytorch/tests/multi_party/sloth_clip.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/sloth_drelu.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/sloth_lrs.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/sloth_maxpool.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/sloth_maxpool_tri.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/sloth_relu.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/softmax.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/tanh.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/truncatereduce.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/pubcmp.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/sloth_drelu.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/truncatereduce.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/wrap.cpp create mode 100644 GPU-MPC/fss/dcf/gpu_dcf.cu create mode 100644 GPU-MPC/fss/dcf/gpu_dcf.h create mode 100644 GPU-MPC/fss/dcf/gpu_dcf_templates.h create mode 100644 GPU-MPC/fss/dcf/gpu_maxpool.cu create mode 100644 GPU-MPC/fss/dcf/gpu_maxpool.h create mode 100644 GPU-MPC/fss/dcf/gpu_relu.cu create mode 100644 GPU-MPC/fss/dcf/gpu_relu.h create mode 100644 GPU-MPC/fss/dcf/gpu_sgd.cu create mode 100644 GPU-MPC/fss/dcf/gpu_sgd.h create mode 100644 GPU-MPC/fss/dcf/gpu_sstab.h create mode 100644 GPU-MPC/fss/dcf/gpu_truncate.cu create mode 100644 GPU-MPC/fss/dcf/gpu_truncate.h create mode 100644 GPU-MPC/fss/gpu_add.h create mode 100644 GPU-MPC/fss/gpu_aes_shm.cu create mode 100644 GPU-MPC/fss/gpu_aes_shm.h create mode 100644 GPU-MPC/fss/gpu_aes_table.h create mode 100644 GPU-MPC/fss/gpu_and.cu create mode 100644 GPU-MPC/fss/gpu_and.h create mode 100644 GPU-MPC/fss/gpu_avgpool.cu create mode 100644 GPU-MPC/fss/gpu_avgpool.h create mode 100644 GPU-MPC/fss/gpu_conv2d.cu create mode 100644 GPU-MPC/fss/gpu_conv2d.h create mode 100644 GPU-MPC/fss/gpu_dpf.cu create mode 100644 GPU-MPC/fss/gpu_dpf.h create mode 100644 
GPU-MPC/fss/gpu_dpf_templates.h create mode 100644 GPU-MPC/fss/gpu_fss_helper.h create mode 100644 GPU-MPC/fss/gpu_gelu.cu create mode 100644 GPU-MPC/fss/gpu_gelu.h create mode 100644 GPU-MPC/fss/gpu_inverse.cu create mode 100644 GPU-MPC/fss/gpu_inverse.h create mode 100644 GPU-MPC/fss/gpu_layernorm.cu create mode 100644 GPU-MPC/fss/gpu_layernorm.h create mode 100644 GPU-MPC/fss/gpu_linear_helper.cu create mode 100644 GPU-MPC/fss/gpu_linear_helper.h create mode 100644 GPU-MPC/fss/gpu_local_truncate.h create mode 100644 GPU-MPC/fss/gpu_lut.cu create mode 100644 GPU-MPC/fss/gpu_lut.h create mode 100644 GPU-MPC/fss/gpu_matmul.cu create mode 100644 GPU-MPC/fss/gpu_matmul.h create mode 100644 GPU-MPC/fss/gpu_maxpool.cu create mode 100644 GPU-MPC/fss/gpu_maxpool.h create mode 100644 GPU-MPC/fss/gpu_mha.cu create mode 100644 GPU-MPC/fss/gpu_mha.h create mode 100644 GPU-MPC/fss/gpu_mul.cu create mode 100644 GPU-MPC/fss/gpu_mul.h create mode 100644 GPU-MPC/fss/gpu_nexp.cu create mode 100644 GPU-MPC/fss/gpu_nexp.h create mode 100644 GPU-MPC/fss/gpu_relu.cu create mode 100644 GPU-MPC/fss/gpu_relu.h create mode 100644 GPU-MPC/fss/gpu_scalarmul.h create mode 100644 GPU-MPC/fss/gpu_select.cu create mode 100644 GPU-MPC/fss/gpu_select.h create mode 100644 GPU-MPC/fss/gpu_softmax.cu create mode 100644 GPU-MPC/fss/gpu_softmax.h create mode 100644 GPU-MPC/fss/gpu_sstab.h create mode 100644 GPU-MPC/fss/gpu_truncate.cu create mode 100644 GPU-MPC/fss/gpu_truncate.h create mode 100644 GPU-MPC/fss/gpu_window.cu create mode 100644 GPU-MPC/fss/gpu_window.h create mode 100644 GPU-MPC/nn/orca/avg_pool_layer.cu create mode 100644 GPU-MPC/nn/orca/avg_pool_layer.h create mode 100644 GPU-MPC/nn/orca/conv2d_layer.cu create mode 100644 GPU-MPC/nn/orca/conv2d_layer.h create mode 100644 GPU-MPC/nn/orca/fc_layer.cu create mode 100644 GPU-MPC/nn/orca/fc_layer.h create mode 100644 GPU-MPC/nn/orca/gpu_layer.h create mode 100644 GPU-MPC/nn/orca/gpu_model.h create mode 100644 
GPU-MPC/nn/orca/maxpool_layer.cu create mode 100644 GPU-MPC/nn/orca/maxpool_layer.h create mode 100644 GPU-MPC/nn/orca/relu_extend_layer.cu create mode 100644 GPU-MPC/nn/orca/relu_extend_layer.h create mode 100644 GPU-MPC/nn/orca/relu_layer.cu create mode 100644 GPU-MPC/nn/orca/relu_layer.h create mode 100644 GPU-MPC/nn/orca_opt.h create mode 100644 GPU-MPC/setup.sh create mode 100644 GPU-MPC/tests/fss/dcf/aes.cu create mode 100644 GPU-MPC/tests/fss/dcf/dcf.cu create mode 100644 GPU-MPC/tests/fss/dcf/maxpool.cu create mode 100644 GPU-MPC/tests/fss/dcf/relu.cu create mode 100644 GPU-MPC/tests/fss/dcf/relu_extend.cu create mode 100644 GPU-MPC/tests/fss/dcf/stochastic_truncate.cu create mode 100644 GPU-MPC/tests/fss/dpf.cu create mode 100644 GPU-MPC/tests/fss/dpf_drelu.cu create mode 100644 GPU-MPC/tests/fss/dpf_eval_all.cu create mode 100644 GPU-MPC/tests/fss/dpf_lut.cu create mode 100644 GPU-MPC/tests/fss/gelu.cu create mode 100644 GPU-MPC/tests/fss/layernorm.cu create mode 100644 GPU-MPC/tests/fss/mha.cu create mode 100644 GPU-MPC/tests/fss/piranha_softmax.cu create mode 100644 GPU-MPC/tests/fss/relu.cu create mode 100644 GPU-MPC/tests/fss/rmsnorm.cu create mode 100644 GPU-MPC/tests/fss/secfloat_softmax.cu create mode 100644 GPU-MPC/tests/fss/silu.cu create mode 100644 GPU-MPC/tests/fss/softmax.cu create mode 100644 GPU-MPC/tests/fss/truncate.cu create mode 100644 GPU-MPC/tests/nn/orca/conv2d_test.cu create mode 100644 GPU-MPC/tests/nn/orca/fc_test.cu create mode 100644 GPU-MPC/tests/nn/orca/maxpool_test.cu create mode 100644 GPU-MPC/tests/nn/orca/relu_extend_test.cu create mode 100644 GPU-MPC/tests/nn/orca/relu_test.cu create mode 100644 GPU-MPC/utils/cpu_comms.h create mode 100644 GPU-MPC/utils/curand_utils.h create mode 100644 GPU-MPC/utils/exception.h create mode 100644 GPU-MPC/utils/gpu_comms.h create mode 100644 GPU-MPC/utils/gpu_data_types.h create mode 100644 GPU-MPC/utils/gpu_file_utils.cpp create mode 100644 GPU-MPC/utils/gpu_file_utils.h create mode 
100644 GPU-MPC/utils/gpu_mem.cu create mode 100644 GPU-MPC/utils/gpu_mem.h create mode 100644 GPU-MPC/utils/gpu_random.cu create mode 100644 GPU-MPC/utils/gpu_random.h create mode 100644 GPU-MPC/utils/gpu_stats.h create mode 100644 GPU-MPC/utils/helper_cuda.h create mode 100644 GPU-MPC/utils/helper_cutlass.h create mode 100644 GPU-MPC/utils/helper_functions.h create mode 100644 GPU-MPC/utils/helper_string.h create mode 100644 GPU-MPC/utils/misc_utils.h create mode 100644 GPU-MPC/utils/sigma_comms.cpp create mode 100644 GPU-MPC/utils/sigma_comms.h diff --git a/GPU-MPC/Dockerfile_Gen b/GPU-MPC/Dockerfile_Gen new file mode 100644 index 00000000..baca632e --- /dev/null +++ b/GPU-MPC/Dockerfile_Gen @@ -0,0 +1,34 @@ +# Author: Tanmay Rajore,Neha Jawalkar +# +# Copyright: +# Copyright (c) 2024 Microsoft Research +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 + +WORKDIR /home +RUN ln -sf /bin/bash /bin/sh + +RUN apt update && apt upgrade -y && apt install -y git apt-utils; \ + apt install -y sudo ; \ + sudo apt install -y gcc-9 g++-9; \ + sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 60 --slave /usr/bin/g++ g++ /usr/bin/g++-9;\ + sudo update-alternatives --config gcc;\ + sudo apt install libssl-dev cmake python3-pip libgmp-dev libmpfr-dev -y;\ + sudo apt install cmake make libeigen3-dev -y ; + +RUN git config --global --add safe.directory * +#RUN git submodule update --init --recursive diff --git a/GPU-MPC/Makefile b/GPU-MPC/Makefile new file mode 100644 index 00000000..022da47b --- /dev/null +++ b/GPU-MPC/Makefile @@ -0,0 +1,132 @@ +CUDA_VERSION ?= $(value CUDA_VERSION) +ifeq ($(CUDA_VERSION),) + CUDA_VERSION = 11.7 +endif +CUTLASS_PATH=./ext/cutlass +SYTORCH_PATH=./ext/sytorch +SYTORCH_BUILD_PATH=$(SYTORCH_PATH)/build +LLAMA_PATH=$(SYTORCH_PATH)/ext/llama +CUDA_ARCH =$(GPU_ARCH) + +CXX=/usr/local/cuda-$(CUDA_VERSION)/bin/nvcc +FLAGS := -O3 -gencode arch=compute_$(CUDA_ARCH),code=[sm_$(CUDA_ARCH),compute_$(CUDA_ARCH)] -std=c++17 -m64 -Xcompiler="-O3,-w,-std=c++17,-fpermissive,-fpic,-pthread,-fopenmp,-march=native" +LIBS := -lsytorch -lcryptoTools -lLLAMA -lbitpack -lcuda -lcudart -lcurand +SECFLOAT_LIBS := -lSCI-FloatML -lSCI-FloatingPoint -lSCI-BuildingBlocks -lSCI-LinearOT -lSCI-GC -lcrypto + +UTIL_FILES := ./utils/gpu_mem.cu ./utils/gpu_file_utils.cpp ./utils/sigma_comms.cpp +OBJ_INCLUDES := -I '$(CUTLASS_PATH)/include' -I '$(CUTLASS_PATH)/tools/util/include' -I '$(SYTORCH_PATH)/include' -I '$(LLAMA_PATH)/include' -I '$(SYTORCH_PATH)/ext/cryptoTools' -I '.' 
+INCLUDES := $(OBJ_INCLUDES) -L$(CUTLASS_PATH)/build/tools/library -L$(SYTORCH_BUILD_PATH) -L$(SYTORCH_BUILD_PATH)/ext/cryptoTools -L$(SYTORCH_BUILD_PATH)/ext/llama -L$(SYTORCH_BUILD_PATH)/ext/bitpack -L$(SYTORCH_BUILD_PATH)/lib + +dpf: tests/fss/dpf.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/dpf + +dpf_eval_all: tests/fss/dpf_eval_all.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/dpf_eval_all + +dpf_drelu: tests/fss/dpf_drelu.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/dpf_drelu + +dpf_lut: tests/fss/dpf_lut.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/dpf_lut + +gelu: tests/fss/gelu.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/gelu + +relu: tests/fss/relu.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/relu + +rmsnorm: tests/fss/rmsnorm.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/rmsnorm + +softmax: tests/fss/softmax.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/softmax + +fc: tests/fss/fc.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/fc + +layernorm: tests/fss/layernorm.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/layernorm + +silu: tests/fss/silu.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/silu + +truncate: tests/fss/truncate.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/truncate + +mha: tests/fss/mha.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/mha + +secfloat_softmax: tests/fss/secfloat_softmax.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) $(SECFLOAT_LIBS) -o tests/fss/secfloat_softmax + +piranha_softmax: tests/fss/piranha_softmax.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/piranha_softmax + +orca_dealer: experiments/orca/orca_dealer.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) 
$(SECFLOAT_LIBS) -o experiments/orca/orca_dealer + +orca_evaluator: experiments/orca/orca_evaluator.cu experiments/orca/datasets/mnist.cpp + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) $(SECFLOAT_LIBS) -o experiments/orca/orca_evaluator + +dcf: tests/fss/dcf/dcf.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/dcf/dcf + +aes: tests/fss/dcf/aes.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/dcf/aes + +dcf_relu_extend: tests/fss/dcf/relu_extend.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/dcf/relu_extend + +dcf_stochastic_truncate: tests/fss/dcf/stochastic_truncate.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/dcf/stochastic_truncate + +dcf_relu: tests/fss/dcf/relu.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/dcf/relu + +orca_conv2d: tests/nn/orca/conv2d_test.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/nn/orca/conv2d + +orca_maxpool: tests/nn/orca/maxpool_test.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/nn/orca/maxpool + +orca_relu_extend: tests/nn/orca/relu_extend_test.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/nn/orca/relu_extend + +orca_fc: tests/nn/orca/fc_test.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/nn/orca/fc + +orca_relu: tests/nn/orca/relu_test.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/nn/orca/relu + +orca_inference: experiments/orca/orca_inference.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o experiments/orca/orca_inference + +orca_inference_u32: experiments/orca/orca_inference.cu + $(CXX) $(FLAGS) -DInfType=u32 $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o experiments/orca/orca_inference_u32 + +sigma: experiments/sigma/sigma.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o experiments/sigma/sigma + +piranha: experiments/orca/piranha.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) 
$(LIBS) -o experiments/orca/piranha + +share_data: experiments/orca/share_data.cpp experiments/orca/datasets/mnist.cpp +	$(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o experiments/orca/share_data + +model_accuracy: experiments/orca/model_accuracy.cu experiments/orca/datasets/mnist.cpp +	$(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o experiments/orca/model_accuracy + +orca: orca_dealer orca_evaluator orca_inference orca_inference_u32 piranha + +clean: + 	rm -rf ext/cutlass/build + 	rm -rf ext/sytorch/build + 	rm -rf orca/experiments/output + 	rm -rf sigma/experiments/output + 	rm experiments/orca/orca_dealer + 	rm experiments/orca/orca_evaluator + 	rm experiments/orca/orca_inference + 	rm experiments/orca/orca_inference_u32 + 	rm experiments/orca/piranha + 	rm experiments/sigma/sigma + diff --git a/GPU-MPC/README.md b/GPU-MPC/README.md new file mode 100644 index 00000000..a9515405 --- /dev/null +++ b/GPU-MPC/README.md @@ -0,0 +1,123 @@ + +# Orca: FSS-based Secure Training and Inference with GPUs + +Implementation of protocols from the paper [Orca](https://eprint.iacr.org/2023/206). + +**Warning**: This is an academic proof-of-concept prototype and has not received careful code review. This implementation is NOT ready for production use. + +## Build + +This project requires NVIDIA GPUs, and assumes that GPU drivers and the [NVIDIA CUDA Toolkit](https://docs.nvidia.com/cuda/) are already installed. The following has been tested on Ubuntu 20.04 with CUDA 11.7, CMake 3.27.2 and g++-9. + +Please note that Sytorch requires CMake version >= 3.17 and the build will fail if this dependency is not met. + +The code uses CUTLASS version 2.11 by default, so if you change the CUDA version, please make sure that the CUTLASS version being built is compatible with the new CUDA version. To change the version of CUTLASS being built, add `git checkout ;` after line 31 (`cd ext/cutlass;`) of setup.sh. 
+ +The last line of `setup.sh` tries to install `matplotlib`, which is needed for generating Figures 5a and 5b. In our experience, the installation fails if the versions of Python and `pip` do not match. In case the installation fails, please install `matplotlib` manually before running `run_experiment.py`. + +1. Export environment variables + +``` +export CUDA_VERSION=11.7 +export GPU_ARCH=86 +``` + +2. Set up the environment + +``` +sh setup.sh +``` + +3. Make Orca + +``` +make orca +``` + +## Run + +1. Each party runs two processes: a dealer and an evaluator. The configuration needs to define the GPU on which the dealer will run, and the directory in which it will store FSS keys. This is done in `config.json` as: + +```javascript +"dealer" : + { "gpu": , + "key_dir": + } +``` + +FSS keys tend to be quite large so please make sure that the key directory has at least 500GB of free space. Please also ensure that it is writeable. + +Similarly, the configuration also needs to define the GPU on which the evaluator will run, and the IP address of its peer, i.e., the address of the remote party the evaluator will communicate with for secure training or inference. This is done in `config.json` as: + +```javascript +"evaluator" : + { "gpu": , + "peer": + } +``` + +You can run Orca to generate Figures 5a and 5b, as well as Tables 3, 4, 6, 7, 8 and 9. Table 5 can be generated by throttling the network bandwidth (with `tc`, for example) and regenerating Table 4. The script reports numbers for Tables 4, 6, 7 and 9 as the average of 10 iterations. + +Figure 5b and Table 3 run end-to-end training and so can take a couple of days to finish. + +Evaluation runs through `experiments/orca/run_experiment.py`. Here are the relevant options: + +``` +usage: run_experiment.py [-h] [--figure FIGURE] [--table TABLE] --party 0/1 + +optional arguments: + --figure FIGURE Figure # to run. + --table TABLE Table # to run. + --all true Run all the experiments. 
+``` + +Results are stored in the `output/P/Table` or `output/P/Fig` folders. + +Log files (which might help with debugging) are stored in the corresponding experiment folders, i.e., in `output/P/Table/logs` and `output/P/Fig/logs`. + +## Docker Build + +You can also build the docker image using the provided Dockerfile_Gen for building the Environment. + +### Install Nvidia Container Toolkit +- Configure the repository: +``` +curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey |sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ +&& curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list \ +&& sudo apt-get update +``` + +- Install the NVIDIA Container Toolkit packages: +``` +sudo apt-get install -y nvidia-container-toolkit +sudo nvidia-ctk runtime configure --runtime=docker +sudo systemctl restart docker +``` +### Build the Docker Image / pull the image from Docker Hub +``` +# Local Build +docker build -t gpu_mpc -f Dockerfile_Gen . + +# Pull from Docker Hub (Cuda 11.8) +docker pull trajore/gpu_mpc +``` +### Run the Docker Container +``` +sudo docker run --gpus all --network host -v /home/$USER/path_to_GPU-MPC/:/home -it container_name /bin/bash + +``` +Then Run setup.sh to configure according to GPU_arch and make orca as mentioned above. + +## Citation + +You can cite the paper using the following BibTeX entry: + +``` +@INPROCEEDINGS {, +author = {N. Jawalkar and K. Gupta and A. Basu and N. Chandran and D. Gupta and R. 
Sharma}, +booktitle = {2024 IEEE Symposium on Security and Privacy (SP)}, +title = {Orca: FSS-based Secure Training and Inference with GPUs}, +year = {2024} +} +``` + diff --git a/GPU-MPC/backend/orca.h b/GPU-MPC/backend/orca.h new file mode 100644 index 00000000..4850da52 --- /dev/null +++ b/GPU-MPC/backend/orca.h @@ -0,0 +1,243 @@ +// Author: Neha Jawalkar +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#pragma once + +#include "utils/gpu_random.h" +#include "utils/gpu_mem.h" + +#include "orca_base.h" + +#include "fss/dcf/gpu_relu.h" +#include "fss/dcf/gpu_truncate.h" +#include "fss/dcf/gpu_maxpool.h" +// pin all the weights and activations in cpu memory + +template +class Orca : public OrcaBase +{ +public: + Orca() : OrcaBase() {} + + Orca(int party, std::string ip, int bw, int scale, std::string keyFile = "") : OrcaBase(party, ip, bw, scale, keyFile, false) + { + } + + void relu(Tensor &in, Tensor &out, const Tensor &drelu, u64 scale, int mode) + { + if (mode == 2) + { + // auto h_inp = (T*) moveToCPU((u8*) in.d_data, in.size() * sizeof(T), NULL); + // printf("Relu input=%ld, %ld, %ld\n", h_inp[0], h_inp[1], h_inp[2]); + + auto start = std::chrono::high_resolution_clock::now(); + + auto k = dcf::readGPUReluExtendKey(&(this->keyBuf)); + auto d_temp = dcf::gpuReluExtend(this->peer, this->party, k, in.d_data, &(this->g), &(this->s)); + auto d_drelu = d_temp.first; + gpuFree(d_drelu); + out.d_data = d_temp.second; + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + this->s.reluext_time += std::chrono::duration_cast(elapsed).count(); + + // printf("Num relus=%d, %lx, %lu\n", out.size(), in.d_data, out.size() * sizeof(T)); + // auto h_data = (T*) moveToCPU((u8*) out.d_data, out.size() * sizeof(T), NULL); + // printf("Relu output=%lu, %lu, %ld\n", h_data[0], h_data[1], h_data[2]); + } + else + { + auto start = std::chrono::high_resolution_clock::now(); + + auto k = dcf::readTwoRoundReluKey(&(this->keyBuf)); + auto d_temp = dcf::gpuTwoRoundRelu(this->peer, this->party, k, in.d_data, &(this->g), &(this->s)); + auto d_drelu = d_temp.first; + gpuFree(d_drelu); + out.d_data = d_temp.second; + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + this->s.relu_time += std::chrono::duration_cast(elapsed).count(); + } + } + + void truncateForward(Tensor &in, u64 shift, u8 mode = 0) + { + // 
printf("Truncate=%lu, %lu, %lu\n", mode, shift, size); + auto start = std::chrono::high_resolution_clock::now(); + if (mode == 0) + { + auto k = dcf::readGPUTrStochasticKey(&(this->keyBuf)); + dcf::gpuStochasticTruncate(k, this->party, this->peer, in.d_data, &(this->g), &(this->s)); + } + else if (mode == 1) + { + auto k = dcf::readGPUStTRKey(&(this->keyBuf)); + dcf::gpuStochasticTR(k, this->party, this->peer, in.d_data, &(this->g), &(this->s)); + } + else + { + assert(0); + } + // auto h_data = (T*) moveToCPU((u8*) in.d_data, in.size() * sizeof(T), NULL); + // printf("Truncate output=%lu, %lu, %lu\n", h_data[0], h_data[1], h_data[in.size() - 1]); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + this->s.truncate_time += std::chrono::duration_cast(elapsed).count(); + } + + void signext(Tensor &x, u64 scale) + { + // printf("Sign ext=%lu\n", x.size()); + auto start = std::chrono::high_resolution_clock::now(); + auto k = dcf::readGPUSignExtendKey(&(this->keyBuf)); + dcf::gpuSignExtend(k, this->party, this->peer, x.d_data, &(this->g), &(this->s)); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + this->s.signext_time += std::chrono::duration_cast(elapsed).count(); + } + + void maxPool2D(u64 ks, u64 padding, u64 stride, const Tensor4D &in, Tensor4D &out, Tensor4D &maxIdx, u64 scale, u8 mode) + { + auto start = std::chrono::high_resolution_clock::now(); + + assert(in.d1 == out.d1); + assert(in.d4 == out.d4); + int tmpBw = this->bw; + // Neha: ugly hack + if (mode == 3) + tmpBw -= scale; + MaxpoolParams p = { + tmpBw, tmpBw, 0, 0, this->bw, + (int)in.d1, (int)in.d2, (int)in.d3, (int)in.d4, + (int)ks, (int)ks, + (int)stride, (int)stride, + (int)padding, (int)padding, + (int)padding, (int)padding, + 0, 0, false}; + initPoolParams(p); + auto k = dcf::readGPUMaxpoolKey(p, &(this->keyBuf)); + out.d_data = dcf::gpuMaxPool(this->peer, this->party, p, k, in.d_data, (u32 *)NULL, &(this->g), 
&(this->s)); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + this->s.maxpool_time += std::chrono::duration_cast(elapsed).count(); + } +}; + +template +class OrcaKeygen : public OrcaBaseKeygen +{ +public: + OrcaKeygen(int party, int bw, int scale, std::string keyFile) : OrcaBaseKeygen(party, bw, scale, keyFile) + { + } + + void relu(Tensor &in, Tensor &out, const Tensor &drelu, u64 scale, int mode) + { + assert(in.is_same_shape(out)); + assert(in.is_same_shape(drelu)); + // printf("Keygen relu mode=%d\n", mode); + if (mode == 2) + { + // auto h_inp = (T*) moveToCPU((u8*) in.d_data, in.size() * sizeof(T), NULL); + // printf("Relu inp mask=%ld, %ld\n", h_inp[0], h_inp[1]); + // printf("Addr=%lx\n", in.d_data); + auto d_tempMask = dcf::gpuKeygenReluExtend(&(this->keyBuf), this->party, this->bw - scale, this->bw, in.size(), in.d_data, &(this->g)); + auto d_dreluMask = d_tempMask.first; + gpuFree(d_dreluMask); + auto d_reluMask = d_tempMask.second; + out.d_data = d_reluMask; + // auto h_out = (T*) moveToCPU((u8*) out.d_data, in.size() * sizeof(T), NULL); + // printf("Relu out mask=%ld, %ld\n", h_out[0], h_out[1]); + } + else + { + int tmpBw = this->bw; + if (mode == 3) + tmpBw -= scale; + auto d_tempMask = dcf::gpuGenTwoRoundReluKey(&(this->keyBuf), this->party, tmpBw, tmpBw, in.size(), in.d_data, &(this->g)); + auto d_dreluMask = d_tempMask.first; + gpuFree(d_dreluMask); + auto d_reluMask = d_tempMask.second; + out.d_data = d_reluMask; + } + // printf("Done keygen relu\n"); + } + + void truncateForward(Tensor &in, u64 shift, u8 mode = 0) + { + if (mode == 0) + { + in.d_data = dcf::genGPUStochasticTruncateKey(&(this->keyBuf), this->party, this->bw, this->bw, shift, in.size(), in.d_data, &(this->g)); + } + else if (mode == 1) + { + in.d_data = dcf::genGPUStTRKey(&(this->keyBuf), this->party, this->bw, this->bw - shift, shift, in.size(), in.d_data, &(this->g)); + } + else + { + assert(0); + } + } + + void + signext(Tensor &x, u64 
scale) + { + // printf("Signext inp mask %lx\n", x.d_data); + + int bin = this->bw - scale; + int bout = this->bw; + x.d_data = dcf::genSignExtendKey(&(this->keyBuf), this->party, bin, bout, x.size(), x.d_data, &(this->g)); + + // auto h_mask = (T*) moveToCPU((u8*) x.d_data, x.size() * sizeof(T), NULL); + // printf("Signext out mask %lx=%ld, %ld\n", x.d_data, h_mask[0], h_mask[1]); + } + + void maxPool2D(u64 ks, u64 padding, u64 stride, const Tensor4D &in, Tensor4D &out, Tensor4D &maxIdx, u64 scale, u8 mode) + { + int tmpBw = this->bw; + // Neha: ugly hack + if (mode == 3) + tmpBw -= scale; + MaxpoolParams p = { + tmpBw, tmpBw, 0, 0, this->bw, + (int)in.d1, (int)in.d2, (int)in.d3, (int)in.d4, + (int)ks, (int)ks, + (int)stride, (int)stride, + (int)padding, (int)padding, + (int)padding, (int)padding, + 0, 0, false}; + initPoolParams(p); + out.d_data = dcf::gpuKeygenMaxpool(&(this->keyBuf), this->party, p, in.d_data, (u8 *)NULL, &(this->g)); + // printf("done with keygen maxpool=%lx\n", out.d_data); + } +}; + +template +class OrcaDummy : public Orca +{ +public: + OrcaDummy() + { + } +}; \ No newline at end of file diff --git a/GPU-MPC/backend/orca_base.h b/GPU-MPC/backend/orca_base.h new file mode 100644 index 00000000..7e2b1104 --- /dev/null +++ b/GPU-MPC/backend/orca_base.h @@ -0,0 +1,311 @@ +// Author: Neha Jawalkar +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include + +#include +#include +#include +#include + +#include "nn/orca_opt.h" + +#include "utils/gpu_random.h" +#include "utils/gpu_mem.h" + +#include "fss/gpu_matmul.h" +#include "fss/gpu_conv2d.h" +#include "fss/gpu_relu.h" +#include "fss/gpu_maxpool.h" +#include "fss/gpu_avgpool.h" +#include "fss/gpu_add.h" + +template +class OrcaBase : public Backend +{ +public: + u8 *startPtr = NULL; + u8 *keyBuf = NULL; + size_t keySize = 0; + int fd = -1; + GpuPeer *peer = NULL; + int party = -1; + Stats s; + int bw; + int scale; + AESGlobalContext g; + + OrcaBase() {} + + OrcaBase(int party, std::string ip, int bw, int scale, std::string keyFile = "", bool compress = true) : party(party), bw(bw), scale(scale) + { + initAESContext(&g); + initGPUMemPool(); + // omp_set_num_threads(2); + if (keyFile.compare("") != 0) + { + auto filename = keyFile + "_inference_key" + std::to_string(party) + ".dat"; + keySize = std::filesystem::file_size(filename); + fd = openForReading(filename); + // printf("%s, %d\n", filename.data(), fd); + getAlignedBuf(&keyBuf, keySize); + startPtr = keyBuf; + } + peer = new GpuPeer(compress); + peer->connect(party, ip); + } + + void close() + { + peer->close(); + // printf("Key read=%lu\n", keyBuf - startPtr); + } + + void conv2D(u64 fh, u64 fw, u64 padding, u64 stride, u64 ci, u64 co, const Tensor4D &input, const Tensor2D &filter, bool useBias, const Tensor1D &bias, Tensor4D &output, bool isFirst) + { + auto comm_start = s.comm_time; + auto start = 
std::chrono::high_resolution_clock::now(); + GPUConv2DKey k; + k.p = { + bw, bw, (int)input.d1, (int)input.d2, (int)input.d3, (int)ci, + (int)fh, (int)fw, (int)co, (int)padding, (int)padding, (int)padding, (int)padding, + (int)stride, (int)stride, 0, 0, 0, 0, 0}; + fillConv2DParams(&(k.p)); + k.mem_size_I = k.p.size_I * sizeof(T); + k.mem_size_F = k.p.size_F * sizeof(T); + k.mem_size_O = k.p.size_O * sizeof(T); + + k.I = (T *)keyBuf; + keyBuf += k.mem_size_I; + k.F = (T *)keyBuf; + keyBuf += k.mem_size_F; + k.O = (T *)keyBuf; + keyBuf += k.mem_size_O; + + auto d_mask_I = (T *)moveToGPU((u8 *)k.I, k.mem_size_I, &s); + if (isFirst) + { + gpuLinearComb(bw, k.p.size_I, input.d_data, T(1), input.d_data, T(1), d_mask_I); + peer->reconstructInPlace(input.d_data, bw, k.p.size_I, &s); + } + // printf("Input=%lx\n", input.d_data); + auto d_F = (T *)moveToGPU((u8 *)filter.data, k.mem_size_F, &s); + // printf("filter=%lu\n", filter.data[k.p.size_F - 1]); + auto d_mask_F = (T *)moveToGPU((u8 *)k.F, k.mem_size_F, &s); + auto d_C = gpuConv2DBeaver(k, party, input.d_data, d_F, d_mask_I, d_mask_F, useBias && party == SERVER0 ? 
bias.data : (T *)NULL, &s, 0); + + gpuFree(d_F); + gpuFree(d_mask_I); + gpuFree(d_mask_F); + // printf("size O=%lu\n", k.p.size_O); + peer->reconstructInPlace(d_C, k.p.bout, k.p.size_O, &s); + output.d_data = d_C; + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + s.conv_time += std::chrono::duration_cast(elapsed).count(); + auto comm_end = s.comm_time; + s.conv_comm_time += (comm_end - comm_start); + } + + void matmul(const Tensor2D &a, const Tensor2D &b, Tensor2D &c, bool useBias, Tensor1D &d, bool isFirst) + { + // auto h_data = (T*) moveToCPU((u8*) a.d_data, a.size() * sizeof(T), NULL); + // printf("Matmul input=%ld, %ld\n", h_data[0], h_data[1]); + // for(int i = 0; i < a.size(); i++) printf("Matmul input=%ld\n", h_data[i]); + auto comm_start = s.comm_time; + auto start = std::chrono::high_resolution_clock::now(); + + MatmulParams p; + p.M = a.d1; + p.K = a.d2; + p.N = b.d2; + p.batchSz = 1; + stdInit(p, bw, 0); + auto k = readGPUMatmulKey(p, TruncateType::None, &keyBuf); + + auto d_mask_A = (T *)moveToGPU((u8 *)k.A, k.mem_size_A, &s); + if (isFirst) + { + gpuLinearComb(bw, p.size_A, a.d_data, T(1), a.d_data, T(1), d_mask_A); + peer->reconstructInPlace(a.d_data, bw, p.size_A, &s); + } + c.d_data = gpuMatmul(peer, party, p, k, a.d_data, b.data, useBias ? 
d.data : (T *)NULL, TruncateType::None, &g, &s, false, d_mask_A); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + s.matmul_time += std::chrono::duration_cast(elapsed).count(); + auto comm_end = s.comm_time; + s.matmul_comm_time += (comm_end - comm_start); + } + + void avgPool2D(u64 ks, u64 padding, u64 stride, const Tensor4D &in, Tensor4D &out, u64 scale) + { + AvgPoolParams p = { + bw, bw, (int)scale, (int)scale, 0, (int)in.d1, (int)in.d2, (int)in.d3, (int)in.d4, + (int)ks, (int)ks, (int)stride, (int)stride, (int)padding, (int)padding, (int)padding, (int)padding, 0, 0, false}; + initPoolParams(p); + out.d_data = gpuAddPool(p, in.d_data, &s); + } + + void output(Tensor &a) + { + // int tmpBw = bw - scale; + int N = a.size(); + unmaskValues(/*tmpBw*/ bw, N, a.d_data, (T *)keyBuf, &s); + gpuLocalTr(party, bw, scale, N, a.d_data, true); + moveIntoCPUMem((u8 *)a.data, (u8 *)a.d_data, N * sizeof(T), &s); + } + + void add(const std::vector *> &in, Tensor &out) + { + int tmpBw = bw - scale; + int N = in[0]->size(); + std::vector gpuInp; + for (int i = 0; i < in.size(); i++) + { + gpuInp.push_back(in[i]->d_data); + } + out.d_data = gpuAdd(tmpBw, N, gpuInp); + } + + void optimize(LayerGraphNode *root) + { + topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) + { orcaOpt(n, r); }); + topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) + { pinCpuMem(n, r); }); + } +}; + +template +class OrcaBaseKeygen : public Backend +{ +public: + u8 *startPtr; + u8 *keyBuf = NULL; + size_t keyBufSize = 0; + int party = -1; + std::string keyFile; + int scale; + int bw; + AESGlobalContext g; + + OrcaBaseKeygen(int party, int bw, int scale, std::string keyFile) : party(party), bw(bw), scale(scale), keyFile(keyFile) + { + initAESContext(&g); + initGPURandomness(); + initCPURandomness(); + initGPUMemPool(); + keyBufSize = 20 * OneGB; + getAlignedBuf(&keyBuf, keyBufSize, true); + startPtr = keyBuf; + } + + void close() + { + size_t 
keySize = keyBuf - startPtr; + size_t padding = 4096 - (keySize % 4096); + char *zeros = new char[padding]; + memset(zeros, 0, padding); + memcpy(keyBuf, zeros, padding); + keyBuf += padding; + keySize += padding; + assert(keySize < keyBufSize); + int fd = openForWriting(keyFile + "_inference_key" + std::to_string(party) + ".dat"); + writeKeyBuf(fd, keySize, startPtr); + assert(0 == fsync(fd) && "sync error!"); + closeFile(fd); + cpuFree(startPtr, true); + destroyGPURandomness(); + destroyCPURandomness(); + } + + void conv2D(u64 fh, u64 fw, u64 padding, u64 stride, u64 ci, u64 co, const Tensor4D &input, const Tensor2D &filter, Tensor4D &output, bool isFirst) + { + GPUConv2DKey k; + k.p = { + bw, bw, (int)input.d1, (int)input.d2, (int)input.d3, (int)ci, + (int)fh, (int)fw, (int)co, (int)padding, (int)padding, (int)padding, (int)padding, + (int)stride, (int)stride, 0, 0, 0, 0, 0}; + fillConv2DParams(&(k.p)); + k.mem_size_I = k.p.size_I * sizeof(T); + k.mem_size_F = k.p.size_F * sizeof(T); + k.mem_size_O = k.p.size_O * sizeof(T); + output.d_data = gpuKeygenConv2D(&keyBuf, party, k, input.d_data, filter.data, true); + } + + void matmul(const Tensor2D &a, const Tensor2D &b, Tensor2D &c) + { + MatmulParams p; + p.M = a.d1; + p.K = a.d2; + p.N = b.d2; + p.batchSz = 1; + stdInit(p, bw, 0); + // printf("####### X=%lu\n", a.size()); + // auto h_temp = (u8*) moveToCPU((u8*) a.d_data, a.size() * sizeof(T), (Stats*) NULL); + c.d_data = gpuKeygenMatmul(&keyBuf, party, p, a.d_data, b.data, (T *)NULL, TruncateType::None, &g, false); + } + + void avgPool2D(u64 ks, u64 padding, u64 stride, const Tensor4D &in, Tensor4D &out, u64 scale) + { + AvgPoolParams p = { + bw, bw, (int)scale, (int)scale, 0, (int)in.d1, (int)in.d2, (int)in.d3, (int)in.d4, + (int)ks, (int)ks, (int)stride, (int)stride, (int)padding, (int)padding, (int)padding, (int)padding, 0, 0, false}; + initPoolParams(p); + out.d_data = gpuAddPool(p, in.d_data, (Stats *)NULL); + } + + void add(const std::vector *> &in, Tensor 
&out) + { + int tmpBw = this->bw - this->scale; + int N = in[0]->size(); + std::vector gpuInp; + for (int i = 0; i < in.size(); i++) + { + gpuInp.push_back(in[i]->d_data); + } + out.d_data = gpuAdd(tmpBw, N, gpuInp); + } + + void addbias(Tensor &x, const Tensor1D &bias) + { + gpuAddBias(1, x.size() / bias.d1, bias.d1, bw, x.d_data, bias.data, NULL); + } + + void output(Tensor &a) + { + int N = a.size(); + size_t memSz = N * sizeof(T); + moveIntoCPUMem((u8 *)keyBuf, (u8 *)a.d_data, memSz, (Stats *)NULL); + keyBuf += memSz; + } + + void optimize(LayerGraphNode *root) + { + topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) + { orcaOpt(n, r); }); + topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) + { pinCpuMem(n, r); }); + } +}; diff --git a/GPU-MPC/backend/piranha.h b/GPU-MPC/backend/piranha.h new file mode 100644 index 00000000..5d2f10ec --- /dev/null +++ b/GPU-MPC/backend/piranha.h @@ -0,0 +1,160 @@ +// Author: Neha Jawalkar +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include "utils/gpu_random.h" +#include "utils/gpu_mem.h" + +#include "orca.h" +// pin all the weights and activations in cpu memory + +template +void piranhaOpt(LayerGraphNode *n, LayerGraphNode *r) +{ + if (!n->layer->isTrainingMode && n->children.size() == 0) + { + n->layer->doTruncationForward = false; + } + if (n->layer->name == "AvgPool2D" && n->children[0]->layer->name == "ReLU") + { + auto avgPool = static_cast *>(n->layer); + auto relu = n->children[0]->layer; + relu->mode = (int)std::log2(avgPool->ks * avgPool->ks); + } + else if (n->layer->name == "Flatten") + { + // delete flatten and add a flag to FC instead + assert(n->parents.size() == 1 && n->children.size() == 1); + auto parent = n->parents[0]; + // printf("%s\n", parent->layer->name.data()); + auto child = n->children[0]; + assert(parent->children.size() == 1); + assert(child->parents.size() == 1); + parent->children[0] = child; + child->parents[0] = parent; + always_assert(parent->currTensor->shape.size() == 4); + always_assert(child->layer->name == "FC"); + auto fc = static_cast *>(child->layer); + // // todo: free the memory used up by flatten + auto batchSz = parent->currTensor->shape[0]; + auto h = parent->currTensor->shape[1]; + auto w = parent->currTensor->shape[2]; + auto c = parent->currTensor->shape[3]; + int m = fc->out; + assert(h * w * c == fc->in); + parent->currTensor = new Tensor(parent->layer->activation.data, parent->layer->activation.d_data, {batchSz, h * w * c}); + // printf("New tensor=%lx\n", parent->currTensor); + parent->currTensor->graphNode = parent; + int i; + for (i = 0; i < n->allNodesInExecutionOrderRef->size(); i++) + { + if (n->allNodesInExecutionOrderRef->at(i) == n) + { + 
break; + } + } + n->allNodesInExecutionOrderRef->erase(n->allNodesInExecutionOrderRef->begin() + i); + } +} + +template +class Piranha : public Orca +{ +public: + Piranha() : Orca() {} + + Piranha(int party, std::string ip, int bw, int scale, std::string keyFile = "") : Orca(party, ip, bw, scale, keyFile) + { + } + + void relu(Tensor &in, Tensor &out, const Tensor &drelu, u64 scale, int mode) + { + // assert(mode == 2); + auto start = std::chrono::high_resolution_clock::now(); + auto k = dcf::readTwoRoundReluKey(&(this->keyBuf)); + auto d_temp = dcf::gpuTwoRoundRelu(this->peer, this->party, k, in.d_data, &(this->g), &(this->s)); + auto d_drelu = d_temp.first; + gpuFree(d_drelu); + out.d_data = d_temp.second; + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + this->s.relu_time += std::chrono::duration_cast(elapsed).count(); + } + + void truncateForward(Tensor &in, u64 shift, u8 mode = 0) + { + auto start = std::chrono::high_resolution_clock::now(); + auto d_inp = in.d_data; + GPUTruncateKey k; + in.d_data = gpuTruncate(this->bw, this->bw, TruncateType::LocalARS, k, (int)shift, this->peer, (int)this->party, (int)in.size(), (T *)in.d_data, &(this->g), &(this->s)); + gpuFree(d_inp); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + this->s.truncate_time += std::chrono::duration_cast(elapsed).count(); + } + + void optimize(LayerGraphNode *root) + { + topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) + { piranhaOpt(n, r); }); + topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) + { pinCpuMem(n, r); }); + } +}; + +template +class PiranhaKeygen : public OrcaKeygen +{ +public: + PiranhaKeygen(int party, int bw, int scale, std::string keyFile) : OrcaKeygen(party, bw, scale, keyFile) + { + } + + void relu(Tensor &in, Tensor &out, const Tensor &drelu, u64 scale, int mode) + { + assert(in.is_same_shape(out)); + assert(in.is_same_shape(drelu)); + // assert(mode == 2); + int 
tmpBw = this->bw - scale - mode; + // printf("Inp=%lx, mode=%d, N=%lu\n", in.d_data, mode, in.size()); + auto d_tempMask = dcf::gpuGenTwoRoundReluKey(&(this->keyBuf), this->party, tmpBw, this->bw, in.size(), in.d_data, &(this->g)); + auto d_dreluMask = d_tempMask.first; + gpuFree(d_dreluMask); + auto d_reluMask = d_tempMask.second; + out.d_data = d_reluMask; + } + + void truncateForward(Tensor &in, u64 shift, u8 mode = 0) + { + auto d_inp = in.d_data; + in.d_data = genGPUTruncateKey(&(this->keyBuf), this->party, TruncateType::LocalARS, this->bw, this->bw, shift, in.size(), in.d_data, &(this->g)); + gpuFree(d_inp); + } + + void optimize(LayerGraphNode *root) + { + topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) + { piranhaOpt(n, r); }); + topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) + { pinCpuMem(n, r); }); + } +}; \ No newline at end of file diff --git a/GPU-MPC/backend/sigma.h b/GPU-MPC/backend/sigma.h new file mode 100644 index 00000000..d5667478 --- /dev/null +++ b/GPU-MPC/backend/sigma.h @@ -0,0 +1,428 @@ +// Author: Neha Jawalkar +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include + +#include +#include +#include +#include + +#include "nn/orca_opt.h" + +#include "utils/gpu_random.h" +#include "utils/gpu_mem.h" + +#include "fss/gpu_matmul.h" +#include "fss/gpu_gelu.h" +#include "fss/gpu_layernorm.h" +#include "fss/gpu_mha.h" +#include "fss/gpu_add.h" + +template +void noTruncateAfterRmsnorm(LayerGraphNode *n, LayerGraphNode *r) +{ + if (n->layer->name == "RMSNorm") + { + n->layer->doTruncationForward = false; + } +} + +template +class SIGMA : public Backend +{ +public: + u8 *startPtr = NULL; + u8 *keyBuf = NULL; + size_t keySize = 0; + // int fd = -1; + GpuPeer *peer = NULL; + int party = -1; + Stats s; + int bw = 0, scale = 0, n_seq = 0; + AESGlobalContext g; + MHATables d_mhaTab; + T *d_geluTab, *d_siluTab; + std::vector *invSqrtTab; + LlamaTransformer *llama; + + SIGMA(int party, std::string ip, std::string keyFile, int bw, int scale, int n_seq, int n_embed, int numThreads) : party(party), bw(bw), scale(scale), n_seq(n_seq) + { + initAESContext(&g); + initGPUMemPool(); + // initCommBufs(true); + + d_geluTab = genLUT>(8, 6, scale); + d_siluTab = genLUT>(10, 6, scale); + d_mhaTab = initMHATables(n_seq, scale); + + omp_set_num_threads(numThreads); + + invSqrtTab = new std::vector(1LL << 13); +#pragma omp parallel for + for (int i = 0; i < (1LL << 13); ++i) + { + GroupElement k = i % (1LL << 6); + GroupElement m = i >> 6; + double val = double(m + 128) * std::pow(2.0, k - 7); + (*invSqrtTab)[i] = GroupElement(double(1LL << (2 * scale)) / sqrt(val / n_embed)); + } + + auto filename = keyFile + "_" + std::to_string(party) + ".dat"; + keySize = std::filesystem::file_size(filename); + int fd = openForReading(filename); + printf("%s, %d\n", 
filename.data(), fd); + getAlignedBuf(&keyBuf, keySize); + readKey(fd, keySize, keyBuf, NULL); + + startPtr = keyBuf; + + LlamaConfig::bitlength = bw; + LlamaConfig::party = party + 2; + LlamaConfig::stochasticT = false; + LlamaConfig::stochasticRT = false; + + llama = new LlamaTransformer(); + if (party == SERVER0) + llama->initServer(ip, (char **)&keyBuf); + else + llama->initClient(ip, (char **)&keyBuf); + + peer = new GpuPeer(true); + peer->peer = LlamaConfig::peer; + } + + void close() + { + peer->close(); + // printf("Key read=%lu\n", keyBuf - startPtr); + } + + void matmul(const Tensor2D &a, const Tensor2D &b, Tensor2D &c, bool useBias, Tensor1D &d, bool isFirst) + { + auto start = std::chrono::high_resolution_clock::now(); + + MatmulParams p; + p.M = a.d1; + p.K = a.d2; + p.N = b.d2; + p.batchSz = 1; + stdInit(p, bw, 0); + auto k = readGPUMatmulKey(p, TruncateType::None, &keyBuf); + c.d_data = gpuMatmul(peer, party, p, k, a.d_data, b.data, useBias ? d.data : (T *)NULL, TruncateType::None, &g, &s, false); + // printf("Matmul weights=%ld, %ld, %ld\n", b.data[0], b.data[1], b.data[b.size() - 1]); + + // auto h_out = (T*) moveToCPU((u8*) c.d_data, p.size_C * sizeof(T), NULL); + // printf("Matmul output=%ld, %ld\n", h_out[0], h_out[1]); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + s.matmul_time += std::chrono::duration_cast(elapsed).count(); + } + + void gelu(const Tensor &in, Tensor &out, u64 scale, u64 mode = 0) + { + u64 b0 = peer->bytesSent() + peer->bytesReceived(); + auto start = std::chrono::high_resolution_clock::now(); + + auto k = readGpuGeluKey(&keyBuf); + out.d_data = gpuGelu(peer, party, k, bw, bw - scale, (int)scale, in.size(), in.d_data, d_geluTab, &g, &s); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + s.gelu_time += std::chrono::duration_cast(elapsed).count(); + u64 b1 = peer->bytesSent() + peer->bytesReceived(); + s.gelu_comm_bytes += (b1 - b0); + } + + void 
silu(const Tensor &in, Tensor &out, u64 scale, u64 mode = 0) + { + u64 b0 = peer->bytesSent() + peer->bytesReceived(); + auto start = std::chrono::high_resolution_clock::now(); + + auto k = readGpuGeluKey(&keyBuf); + out.d_data = gpuGelu(peer, party, k, bw, bw - scale, (int)scale, in.size(), in.d_data, d_siluTab, &g, &s); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + s.gelu_time += std::chrono::duration_cast(elapsed).count(); + u64 b1 = peer->bytesSent() + peer->bytesReceived(); + s.gelu_comm_bytes += (b1 - b0); + } + + void SIGMALayernorm(const Tensor1D &A, const Tensor1D &B, const Tensor &x, Tensor &y, u64 scale, bool computeMu) + { + u64 b0 = peer->bytesSent() + peer->bytesReceived(); + auto start = std::chrono::high_resolution_clock::now(); + + AvgPoolParams p = {bw, bw, scale, 0, 0, 1, x.shape[0], x.shape[1], 1, 1, x.shape[1], 1, x.shape[1], 0, 0, 0, 0}; + initPoolParams(p); + auto k = readGPULayerNormKey(p, &keyBuf, computeMu); + // assert(d_invSqrtTab); + auto d_A = (T *)moveToGPU((u8 *)A.data, A.size() * sizeof(T), &s); + auto d_B = (T *)moveToGPU((u8 *)B.data, B.size() * sizeof(T), &s); + y.d_data = gpuLayerNorm(peer, party, p, k, d_A, d_B, x.d_data, /*(std::vector *)*/ invSqrtTab, &g, &s, computeMu); + gpuFree(d_A); + gpuFree(d_B); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + s.layernorm_time += std::chrono::duration_cast(elapsed).count(); + u64 b1 = peer->bytesSent() + peer->bytesReceived(); + s.layernorm_comm_bytes += (b1 - b0); + } + + void layernorm(const Tensor1D &A, const Tensor1D &B, const Tensor &x, Tensor &y, u64 scale) + { + SIGMALayernorm(A, B, x, y, scale, true); + } + + void rmsnorm(const Tensor1D &A, const Tensor1D &B, const Tensor &x, Tensor &y, u64 scale) + { + SIGMALayernorm(A, B, x, y, scale, false); + } + + void mha(int n_heads, int n_embed, int dim_W, bool selfAttn, bool doNormQKt, bool doRotEmb, const Tensor2D &wQKV, const Tensor1D &bQKV, const 
Tensor2D &wProj, const Tensor1D &bProj, const Tensor2D &X, Tensor2D &Y) + { + auto start = std::chrono::high_resolution_clock::now(); + + MHAParams pMHA = {n_seq, n_embed, n_heads, dim_W, selfAttn, doNormQKt, doRotEmb}; + MHAMulParams pMHAMul = initMHAMulParams(pMHA, bw, scale); + auto k = readGPUMHAKey(pMHA, pMHAMul, &keyBuf); + Y.d_data = gpuMHA(peer, party, bw, scale, pMHA, pMHAMul, k, wQKV.data, bQKV.data, wProj.data, bProj.data, X.d_data, d_mhaTab, &g, &s); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + s.mha_time += std::chrono::duration_cast(elapsed).count(); + } + + void truncateForward(Tensor &in, u64 shift, u8 mode = 0) + { + // printf("Truncate=%lu, %lu, %lu\n", mode, shift, size); + auto start = std::chrono::high_resolution_clock::now(); + + TruncateType t = TruncateType::TrFloor; + auto k = readGPUTruncateKey(t, &keyBuf); + in.d_data = gpuTruncate(k.bin, k.bout, t, k, k.shift, peer, party, k.N, in.d_data, &g, &s); + + // auto h_data = (T*) moveToCPU((u8*) in.d_data, in.size() * sizeof(T), NULL); + // printf("Truncate output=%lu, %lu, %lu\n", h_data[0], h_data[1], h_data[in.size() - 1]); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + s.truncate_time += std::chrono::duration_cast(elapsed).count(); + } + + void mul(const Tensor &a, const Tensor &b, Tensor &out) + { + u64 N = a.size(); + auto k = readGPUMulKey(&keyBuf, N, N, N, TruncateType::None); + out.d_data = gpuMul(peer, party, bw, scale, N, k, a.d_data, b.d_data, TruncateType::None, &g, &s); + } + + void output(Tensor &a) + { + // printf("Inside output=%lx\n", a.d_data); + // int tmpBw = bw - scale; + int N = a.size(); + // printf("keyBuf=%lx, %lu\n", keyBuf, keyBuf - startPtr); + unmaskValues(bw, N, a.d_data, (T *)keyBuf, &s); + // printf("boo\n"); + moveIntoCPUMem((u8 *)a.data, (u8 *)a.d_data, N * sizeof(T), &s); + } + + void add(const std::vector *> &in, Tensor &out) + { + int tmpBw = bw - scale; + int N = 
in[0]->size(); + std::vector gpuInp; + for (int i = 0; i < in.size(); i++) + { + gpuInp.push_back(in[i]->d_data); + } + out.d_data = gpuAdd(tmpBw, N, gpuInp); + } + + void optimize(LayerGraphNode *root) + { + topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) + { pinCpuMem(n, r); }); + topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) + { noTruncateAfterRmsnorm(n, r); }); + } +}; + +template +class SIGMAKeygen : public Backend +{ +public: + u8 *startPtr; + u8 *keyBuf = NULL; + size_t keyBufSize = 0; + int party = -1; + std::string keyFile; + int scale; + int bw; + AESGlobalContext g; + LlamaTransformer *llama; + u8 *llamaBuf1, *llamaBuf2; + u8 *dummyBuf1, *dummyBuf2; + + SIGMAKeygen(int party, int bw, int scale, std::string keyFile, size_t keyBufSize) : party(party), bw(bw), scale(scale), keyFile(keyFile), keyBufSize(keyBufSize) + { + initAESContext(&g); + initGPURandomness(); + initGPUMemPool(); + // keyBufSize = 20 * OneGB; + keyBuf = cpuMalloc(keyBufSize); + startPtr = keyBuf; + + LlamaConfig::bitlength = bw; + LlamaConfig::party = DEALER; + LlamaConfig::stochasticT = false; + LlamaConfig::stochasticRT = false; + + llama = new LlamaTransformer(); + llamaBuf1 = (u8 *)cpuMalloc(OneGB); + dummyBuf1 = (u8 *)cpuMalloc(OneGB); + llamaBuf2 = llamaBuf1; + dummyBuf2 = dummyBuf1; + llama->initDealer((char **)(party == SERVER0 ? &llamaBuf2 : &dummyBuf2), (char **)(party == SERVER1 ? 
&llamaBuf2 : &dummyBuf2)); + } + + void close() + { + size_t keySize = keyBuf - startPtr; + size_t padding = 4096 - (keySize % 4096); + char *zeros = new char[padding]; + memset(zeros, 0, padding); + memcpy(keyBuf, zeros, padding); + keyBuf += padding; + keySize += padding; + assert(keySize < keyBufSize); + std::ofstream f(keyFile + "_" + std::to_string(party) + ".dat"); + f.write((char *)startPtr, keySize); + f.close(); + cpuFree(startPtr); + } + + void matmul(const Tensor2D &a, const Tensor2D &b, Tensor2D &c) + { + MatmulParams p; + p.M = a.d1; + p.K = a.d2; + p.N = b.d2; + p.batchSz = 1; + stdInit(p, bw, 0); + c.d_data = gpuKeygenMatmul(&keyBuf, party, p, a.d_data, b.data, (T *)NULL, TruncateType::None, &g, false); + } + + void gelu(const Tensor &in, Tensor &out, u64 scale, u64 mode = 0) + { + out.d_data = gpuKeyGenGelu(&keyBuf, party, bw, bw - scale, (int)scale, in.size(), in.d_data, &g); + } + + void silu(const Tensor &in, Tensor &out, u64 scale, u64 mode = 0) + { + out.d_data = gpuKeyGenGelu(&keyBuf, party, bw, bw - scale, (int)scale, in.size(), in.d_data, &g); + } + + void SIGMALayernormKeygen(const Tensor1D &A, const Tensor1D &B, const Tensor &x, Tensor &y, u64 scale, bool computeMu) + { + AvgPoolParams p = {bw, bw, scale, 0, 0, 1, x.shape[0], x.shape[1], 1, 1, x.shape[1], 1, x.shape[1], 0, 0, 0, 0}; + initPoolParams(p); + auto d_mask_A = (T *)moveToGPU((u8 *)A.data, A.size() * sizeof(T), (Stats *)NULL); + auto d_mask_B = (T *)moveToGPU((u8 *)B.data, B.size() * sizeof(T), (Stats *)NULL); + y.d_data = gpuKeygenLayerNorm(&keyBuf, party, p, d_mask_A, d_mask_B, x.d_data, &g, computeMu); + size_t llamaKeySz = llamaBuf2 - llamaBuf1; + memcpy(keyBuf, llamaBuf1, llamaKeySz); + keyBuf += llamaKeySz; + llamaBuf2 = llamaBuf1; + gpuFree(d_mask_A); + gpuFree(d_mask_B); + } + + void layernorm(const Tensor1D &A, const Tensor1D &B, const Tensor &x, Tensor &y, u64 scale) + { + SIGMALayernormKeygen(A, B, x, y, scale, true); + } + + void rmsnorm(const Tensor1D &A, const 
Tensor1D &B, const Tensor &x, Tensor &y, u64 scale) + { + SIGMALayernormKeygen(A, B, x, y, scale, false); + } + + void mha(int n_heads, int n_embed, int dim_W, bool selfAttn, bool doNormQKt, bool doRotEmb, const Tensor2D &wQKV, const Tensor1D &bQKV, const Tensor2D &wProj, const Tensor1D &bProj, const Tensor2D &X, Tensor2D &Y) + { + MHAParams pMHA = {X.d1, n_embed, n_heads, dim_W, selfAttn, doNormQKt, doRotEmb}; + MHAMulParams pMHAMul = initMHAMulParams(pMHA, bw, scale); + printf("scale=%d\n", pMHAMul.pQKV.shift); + Y.d_data = gpuKeygenMHA(&keyBuf, party, bw, scale, pMHA, pMHAMul, wQKV.data, bQKV.data, wProj.data, bProj.data, X.d_data, &g); + } + + void mul(const Tensor &a, const Tensor &b, Tensor &out) + { + out.d_data = gpuKeygenMul(&keyBuf, party, bw, scale, a.size(), a.d_data, b.d_data, TruncateType::None, &g); + } + + void truncateForward(Tensor &in, u64 shift, u8 mode = 0) + { + TruncateType t = TruncateType::TrFloor; + in.d_data = genGPUTruncateKey(&keyBuf, party, t, bw, bw, shift, in.size(), in.d_data, &g); + } + + void add(const std::vector *> &in, Tensor &out) + { + int tmpBw = bw - scale; + int N = in[0]->size(); + // printf("Add input=%d, %lx, %lx\n", N, in[0]->d_data, in[1]->d_data); + std::vector gpuInp; + for (int i = 0; i < in.size(); i++) + { + gpuInp.push_back(in[i]->d_data); + // printf("Add inp=%lx\n", in[i]->d_data); + } + out.d_data = gpuAdd(tmpBw, N, gpuInp); + } + + void addbias(Tensor &x, const Tensor1D &bias) + { + gpuAddBias(1, x.size() / bias.d1, bias.d1, bw, x.d_data, bias.data, NULL); + } + + void output(Tensor &a) + { + int N = a.size(); + size_t memSz = N * sizeof(T); + moveIntoCPUMem((u8 *)keyBuf, (u8 *)a.d_data, memSz, (Stats *)NULL); + keyBuf += memSz; + } + + void optimize(LayerGraphNode *root) + { + topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) + { pinCpuMem(n, r); }); + topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) + { noTruncateAfterRmsnorm(n, r); }); + } +}; diff --git 
a/GPU-MPC/experiments/__init__.py b/GPU-MPC/experiments/__init__.py new file mode 100644 index 00000000..11ba17de --- /dev/null +++ b/GPU-MPC/experiments/__init__.py @@ -0,0 +1,21 @@ +# +# Copyright: +# +# Copyright (c) 2024 Microsoft Research +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ diff --git a/GPU-MPC/experiments/orca/__init__.py b/GPU-MPC/experiments/orca/__init__.py new file mode 100644 index 00000000..11ba17de --- /dev/null +++ b/GPU-MPC/experiments/orca/__init__.py @@ -0,0 +1,21 @@ +# +# Copyright: +# +# Copyright (c) 2024 Microsoft Research +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ diff --git a/GPU-MPC/experiments/orca/cnn.h b/GPU-MPC/experiments/orca/cnn.h new file mode 100644 index 00000000..fd3d59b6 --- /dev/null +++ b/GPU-MPC/experiments/orca/cnn.h @@ -0,0 +1,1388 @@ +// +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#pragma once + +#include + +#include "utils/gpu_data_types.h" + +#include "nn/orca/gpu_model.h" +#include "nn/orca/conv2d_layer.h" +#include "nn/orca/maxpool_layer.h" +#include "nn/orca/relu_layer.h" +#include "nn/orca/relu_extend_layer.h" +#include "nn/orca/avg_pool_layer.h" +#include "nn/orca/fc_layer.h" + +#include "backend/orca.h" +#include "backend/piranha.h" + +template +class CNN2 : public SytorchModule +{ + Conv2D *conv1; + ReLU *relu1; + MaxPool2D *maxpool1; + Conv2D *conv2; + ReLU *relu2; + MaxPool2D *maxpool2; + Flatten *flatten3; + FC *fc4; + ReLU *relu4; + FC *fc5; + +public: + CNN2() + { + conv1 = new Conv2D(1, 8, 5, 0, 1, true); + relu1 = new ReLU(); + maxpool1 = new MaxPool2D(2, 0, 2); + + conv2 = new Conv2D(8, 16, 5, 0, 1, true); + relu2 = new ReLU(); + maxpool2 = new MaxPool2D(2, 0, 2); + + flatten3 = new Flatten(); + fc4 = new FC(256, 128, true); + relu4 = new ReLU(); + + fc5 = new FC(128, 10, true); + } + + Tensor &_forward(Tensor &input) + { + auto &var1 = conv1->forward(input); + auto &var2 = relu1->forward(var1); + auto &var3 = maxpool1->forward(var2); + auto &var4 = conv2->forward(var3); + auto &var5 = relu2->forward(var4); + auto &var6 = maxpool2->forward(var5); + auto &var7 = flatten3->forward(var6); + auto &var8 = fc4->forward(var7); + auto &var9 = relu4->forward(var8); + auto &var10 = fc5->forward(var9); + return var10; + } +}; + +template +class PLenetNoReluAvgPool : public SytorchModule +{ + Conv2D *conv1; + AvgPool2D *avgpool1; + ReLU *relu1; + Conv2D *conv2; + AvgPool2D *avgpool2; + ReLU *relu2; + Flatten *flatten3; + FC *fc4; + ReLU *relu4; + FC *fc5; + +public: + PLenetNoReluAvgPool() + { + conv1 = new Conv2D(1, 20, 5, 0, 1, false); + avgpool1 = new AvgPool2D(2, 0, 2); + relu1 = new ReLU(); + conv2 = new Conv2D(20, 50, 5, 0, 1, false); + avgpool2 = new AvgPool2D(2, 0, 2); + relu2 = new ReLU(); + flatten3 = new Flatten(); + fc4 = new FC(800, 500, true); + relu4 = new ReLU(); + fc5 = new FC(500, 10, true); + } + + Tensor 
&_forward(Tensor &input) + { + auto &var1 = conv1->forward(input); + auto &var2 = avgpool1->forward(var1); + auto &var3 = relu1->forward(var2); + auto &var4 = conv2->forward(var3); + auto &var5 = avgpool2->forward(var4); + auto &var6 = relu2->forward(var5); + auto &var7 = flatten3->forward(var6); + auto &var8 = fc4->forward(var7); + auto &var9 = relu4->forward(var8); + auto &var10 = fc5->forward(var9); + return var10; + } +}; + +template +class MinionnLenet : public SytorchModule +{ + Conv2D *conv1; + ReLU *relu1; + MaxPool2D *maxpool1; + Conv2D *conv2; + ReLU *relu2; + MaxPool2D *maxpool2; + Flatten *flatten3; + FC *fc4; + ReLU *relu4; + FC *fc5; + +public: + MinionnLenet() + { + conv1 = new Conv2D(1, 16, 5, 0, 1, true); + relu1 = new ReLU(); + maxpool1 = new MaxPool2D(2, 0, 2); + + conv2 = new Conv2D(16, 16, 5, 0, 1, true); + relu2 = new ReLU(); + maxpool2 = new MaxPool2D(2, 0, 2); + + flatten3 = new Flatten(); + fc4 = new FC(256, 100, true); + relu4 = new ReLU(); + + fc5 = new FC(100, 10, true); + } + + Tensor &_forward(Tensor &input) + { + auto &var1 = conv1->forward(input); + auto &var2 = relu1->forward(var1); + auto &var3 = maxpool1->forward(var2); + auto &var4 = conv2->forward(var3); + auto &var5 = relu2->forward(var4); + auto &var6 = maxpool2->forward(var5); + auto &var7 = flatten3->forward(var6); + auto &var8 = fc4->forward(var7); + auto &var9 = relu4->forward(var8); + auto &var10 = fc5->forward(var9); + return var10; + } +}; + +template +class PSecureMlNoRelu : public SytorchModule +{ + FC *fc1; + ReLU *relu1; + FC *fc2; + ReLU *relu2; + FC *fc3; + +public: + PSecureMlNoRelu() + { + fc1 = new FC(784, 128, true); + relu1 = new ReLU(); + fc2 = new FC(128, 128, true); + relu2 = new ReLU(); + fc3 = new FC(128, 10, true); + } + + Tensor &_forward(Tensor &input) + { + auto &var1 = fc1->forward(input); + auto &var2 = relu1->forward(var1); + auto &var3 = fc2->forward(var2); + auto &var4 = relu2->forward(var3); + auto &var5 = fc3->forward(var4); + return var5; + } 
+}; + +template +class CNN3 : public SytorchModule +{ + Conv2D *conv1; + ReLU *relu1; + MaxPool2D *maxpool1; + Conv2D *conv2; + ReLU *relu2; + MaxPool2D *maxpool2; + Conv2D *conv3; + ReLU *relu3; + MaxPool2D *maxpool3; + Flatten *flatten4; + FC *fc5; + +public: + CNN3() + { + conv1 = new Conv2D(3, 64, 5, 1, 1, true); + relu1 = new ReLU(); + maxpool1 = new MaxPool2D(3, 0, 2); + + conv2 = new Conv2D(64, 64, 5, 1, 1, true); + relu2 = new ReLU(); + maxpool2 = new MaxPool2D(3, 0, 2); + + conv3 = new Conv2D(64, 64, 5, 1, 1, true); + relu3 = new ReLU(); + maxpool3 = new MaxPool2D(3, 0, 2); + + flatten4 = new Flatten(); + fc5 = new FC(64, 10, true); + } + + Tensor &_forward(Tensor &input) + { + auto &var1 = conv1->forward(input); + auto &var2 = relu1->forward(var1); + auto &var3 = maxpool1->forward(var2); + auto &var4 = conv2->forward(var3); + auto &var5 = relu2->forward(var4); + auto &var6 = maxpool2->forward(var5); + auto &var7 = conv3->forward(var6); + auto &var8 = relu3->forward(var7); + auto &var9 = maxpool3->forward(var8); + auto &var10 = flatten4->forward(var9); + auto &var11 = fc5->forward(var10); + return var11; + } +}; + +template +class VGG16 : public SytorchModule +{ + using SytorchModule::add; + +public: + Conv2D *conv0; + ReLU *relu1; + Conv2D *conv2; + MaxPool2D *maxpool3; + ReLU *relu4; + Conv2D *conv5; + ReLU *relu6; + Conv2D *conv7; + MaxPool2D *maxpool8; + ReLU *relu9; + Conv2D *conv10; + ReLU *relu11; + Conv2D *conv12; + ReLU *relu13; + Conv2D *conv14; + MaxPool2D *maxpool15; + ReLU *relu16; + Conv2D *conv17; + ReLU *relu18; + Conv2D *conv19; + ReLU *relu20; + Conv2D *conv21; + MaxPool2D *maxpool22; + ReLU *relu23; + Conv2D *conv24; + ReLU *relu25; + Conv2D *conv26; + ReLU *relu27; + Conv2D *conv28; + MaxPool2D *maxpool29; + ReLU *relu30; + Flatten *reshape31; + FC *gemm32; + ReLU *relu33; + FC *gemm34; + ReLU *relu35; + FC *gemm36; + +public: + VGG16() + { + conv0 = new Conv2D(3, 64, 3, 1, 1, true); + relu1 = new ReLU(); + conv2 = new Conv2D(64, 64, 3, 
1, 1, true); + maxpool3 = new MaxPool2D(2, 0, 2); + relu4 = new ReLU(); + conv5 = new Conv2D(64, 128, 3, 1, 1, true); + relu6 = new ReLU(); + conv7 = new Conv2D(128, 128, 3, 1, 1, true); + maxpool8 = new MaxPool2D(2, 0, 2); + relu9 = new ReLU(); + conv10 = new Conv2D(128, 256, 3, 1, 1, true); + relu11 = new ReLU(); + conv12 = new Conv2D(256, 256, 3, 1, 1, true); + relu13 = new ReLU(); + conv14 = new Conv2D(256, 256, 3, 1, 1, true); + maxpool15 = new MaxPool2D(2, 0, 2); + relu16 = new ReLU(); + conv17 = new Conv2D(256, 512, 3, 1, 1, true); + relu18 = new ReLU(); + conv19 = new Conv2D(512, 512, 3, 1, 1, true); + relu20 = new ReLU(); + conv21 = new Conv2D(512, 512, 3, 1, 1, true); + maxpool22 = new MaxPool2D(2, 0, 2); + relu23 = new ReLU(); + conv24 = new Conv2D(512, 512, 3, 1, 1, true); + relu25 = new ReLU(); + conv26 = new Conv2D(512, 512, 3, 1, 1, true); + relu27 = new ReLU(); + conv28 = new Conv2D(512, 512, 3, 1, 1, true); + maxpool29 = new MaxPool2D(2, 0, 2); + relu30 = new ReLU(); + reshape31 = new Flatten(); + gemm32 = new FC(25088, 4096, true); + relu33 = new ReLU(); + gemm34 = new FC(4096, 4096, true); + relu35 = new ReLU(); + gemm36 = new FC(4096, 1000, true); + } + + Tensor &_forward(Tensor &input) + { + auto &var35 = conv0->forward(input); + auto &var36 = relu1->forward(var35); + auto &var37 = conv2->forward(var36); + auto &var38 = maxpool3->forward(var37); + auto &var39 = relu4->forward(var38); + auto &var40 = conv5->forward(var39); + auto &var41 = relu6->forward(var40); + auto &var42 = conv7->forward(var41); + auto &var43 = maxpool8->forward(var42); + auto &var44 = relu9->forward(var43); + auto &var45 = conv10->forward(var44); + auto &var46 = relu11->forward(var45); + auto &var47 = conv12->forward(var46); + auto &var48 = relu13->forward(var47); + auto &var49 = conv14->forward(var48); + auto &var50 = maxpool15->forward(var49); + auto &var51 = relu16->forward(var50); + auto &var52 = conv17->forward(var51); + auto &var53 = relu18->forward(var52); + auto 
&var54 = conv19->forward(var53); + auto &var55 = relu20->forward(var54); + auto &var56 = conv21->forward(var55); + auto &var57 = maxpool22->forward(var56); + auto &var58 = relu23->forward(var57); + auto &var59 = conv24->forward(var58); + auto &var60 = relu25->forward(var59); + auto &var61 = conv26->forward(var60); + auto &var62 = relu27->forward(var61); + auto &var63 = conv28->forward(var62); + auto &var64 = maxpool29->forward(var63); + auto &var65 = relu30->forward(var64); + auto &var66 = reshape31->forward(var65); + auto &var67 = gemm32->forward(var66); + auto &var68 = relu33->forward(var67); + auto &var69 = gemm34->forward(var68); + auto &var70 = relu35->forward(var69); + auto &var71 = gemm36->forward(var70); + return var71; + } +}; + +template +class ResNet18 : public SytorchModule +{ + using SytorchModule::add; + using SytorchModule::concat; + +public: + Conv2D *conv0; + MaxPool2D *maxpool1; + ReLU *relu2; + Conv2D *conv3; + ReLU *relu4; + Conv2D *conv5; + ReLU *relu7; + Conv2D *conv8; + ReLU *relu9; + Conv2D *conv10; + ReLU *relu12; + Conv2D *conv13; + ReLU *relu14; + Conv2D *conv15; + Conv2D *conv16; + ReLU *relu18; + Conv2D *conv19; + ReLU *relu20; + Conv2D *conv21; + ReLU *relu23; + Conv2D *conv24; + ReLU *relu25; + Conv2D *conv26; + Conv2D *conv27; + ReLU *relu29; + Conv2D *conv30; + ReLU *relu31; + Conv2D *conv32; + ReLU *relu34; + Conv2D *conv35; + ReLU *relu36; + Conv2D *conv37; + Conv2D *conv38; + ReLU *relu40; + Conv2D *conv41; + ReLU *relu42; + Conv2D *conv43; + ReLU *relu45; + GlobalAvgPool2D *globalaveragepool46; + Flatten *flatten47; + FC *gemm48; + +public: + ResNet18() + { + conv0 = new Conv2D(3, 64, 7, 3, 2, true); + maxpool1 = new MaxPool2D(3, 1, 2); + relu2 = new ReLU(); + conv3 = new Conv2D(64, 64, 3, 1, 1, true); + relu4 = new ReLU(); + conv5 = new Conv2D(64, 64, 3, 1, 1, true); + relu7 = new ReLU(); + conv8 = new Conv2D(64, 64, 3, 1, 1, true); + relu9 = new ReLU(); + conv10 = new Conv2D(64, 64, 3, 1, 1, true); + relu12 = new ReLU(); + 
conv13 = new Conv2D(64, 128, 3, 1, 2, true); + relu14 = new ReLU(); + conv15 = new Conv2D(128, 128, 3, 1, 1, true); + conv16 = new Conv2D(64, 128, 1, 0, 2, true); + relu18 = new ReLU(); + conv19 = new Conv2D(128, 128, 3, 1, 1, true); + relu20 = new ReLU(); + conv21 = new Conv2D(128, 128, 3, 1, 1, true); + relu23 = new ReLU(); + conv24 = new Conv2D(128, 256, 3, 1, 2, true); + relu25 = new ReLU(); + conv26 = new Conv2D(256, 256, 3, 1, 1, true); + conv27 = new Conv2D(128, 256, 1, 0, 2, true); + relu29 = new ReLU(); + conv30 = new Conv2D(256, 256, 3, 1, 1, true); + relu31 = new ReLU(); + conv32 = new Conv2D(256, 256, 3, 1, 1, true); + relu34 = new ReLU(); + conv35 = new Conv2D(256, 512, 3, 1, 2, true); + relu36 = new ReLU(); + conv37 = new Conv2D(512, 512, 3, 1, 1, true); + conv38 = new Conv2D(256, 512, 1, 0, 2, true); + relu40 = new ReLU(); + conv41 = new Conv2D(512, 512, 3, 1, 1, true); + relu42 = new ReLU(); + conv43 = new Conv2D(512, 512, 3, 1, 1, true); + relu45 = new ReLU(); + globalaveragepool46 = new GlobalAvgPool2D(); + flatten47 = new Flatten(); + gemm48 = new FC(512, 1000, true); + } + + Tensor &_forward(Tensor &input) + { + auto &var44 = conv0->forward(input); + auto &var45 = maxpool1->forward(var44); + auto &var46 = relu2->forward(var45); + auto &var47 = conv3->forward(var46); + auto &var48 = relu4->forward(var47); + auto &var49 = conv5->forward(var48); + auto &var50 = add(var49, var46); + auto &var51 = relu7->forward(var50); + auto &var52 = conv8->forward(var51); + auto &var53 = relu9->forward(var52); + auto &var54 = conv10->forward(var53); + auto &var55 = add(var54, var51); + auto &var56 = relu12->forward(var55); + auto &var57 = conv13->forward(var56); + auto &var58 = relu14->forward(var57); + auto &var59 = conv15->forward(var58); + auto &var60 = conv16->forward(var56); + auto &var61 = add(var59, var60); + auto &var62 = relu18->forward(var61); + auto &var63 = conv19->forward(var62); + auto &var64 = relu20->forward(var63); + auto &var65 = 
conv21->forward(var64); + auto &var66 = add(var65, var62); + auto &var67 = relu23->forward(var66); + auto &var68 = conv24->forward(var67); + auto &var69 = relu25->forward(var68); + auto &var70 = conv26->forward(var69); + auto &var71 = conv27->forward(var67); + auto &var72 = add(var70, var71); + auto &var73 = relu29->forward(var72); + auto &var74 = conv30->forward(var73); + auto &var75 = relu31->forward(var74); + auto &var76 = conv32->forward(var75); + auto &var77 = add(var76, var73); + auto &var78 = relu34->forward(var77); + auto &var79 = conv35->forward(var78); + auto &var80 = relu36->forward(var79); + auto &var81 = conv37->forward(var80); + auto &var82 = conv38->forward(var78); + auto &var83 = add(var81, var82); + auto &var84 = relu40->forward(var83); + auto &var85 = conv41->forward(var84); + auto &var86 = relu42->forward(var85); + auto &var87 = conv43->forward(var86); + auto &var88 = add(var87, var84); + auto &var89 = relu45->forward(var88); + auto &var90 = globalaveragepool46->forward(var89); + auto &var91 = flatten47->forward(var90); + auto &var92 = gemm48->forward(var91); + return var92; + } +}; + +template +class ResNet50 : public SytorchModule +{ + using SytorchModule::add; + using SytorchModule::concat; + +public: + Conv2D *conv0; + MaxPool2D *maxpool1; + ReLU *relu2; + Conv2D *conv3; + ReLU *relu4; + Conv2D *conv5; + ReLU *relu6; + Conv2D *conv7; + Conv2D *conv8; + ReLU *relu10; + Conv2D *conv11; + ReLU *relu12; + Conv2D *conv13; + ReLU *relu14; + Conv2D *conv15; + ReLU *relu17; + Conv2D *conv18; + ReLU *relu19; + Conv2D *conv20; + ReLU *relu21; + Conv2D *conv22; + ReLU *relu24; + Conv2D *conv25; + ReLU *relu26; + Conv2D *conv27; + ReLU *relu28; + Conv2D *conv29; + Conv2D *conv30; + ReLU *relu32; + Conv2D *conv33; + ReLU *relu34; + Conv2D *conv35; + ReLU *relu36; + Conv2D *conv37; + ReLU *relu39; + Conv2D *conv40; + ReLU *relu41; + Conv2D *conv42; + ReLU *relu43; + Conv2D *conv44; + ReLU *relu46; + Conv2D *conv47; + ReLU *relu48; + Conv2D *conv49; + ReLU 
*relu50; + Conv2D *conv51; + ReLU *relu53; + Conv2D *conv54; + ReLU *relu55; + Conv2D *conv56; + ReLU *relu57; + Conv2D *conv58; + Conv2D *conv59; + ReLU *relu61; + Conv2D *conv62; + ReLU *relu63; + Conv2D *conv64; + ReLU *relu65; + Conv2D *conv66; + ReLU *relu68; + Conv2D *conv69; + ReLU *relu70; + Conv2D *conv71; + ReLU *relu72; + Conv2D *conv73; + ReLU *relu75; + Conv2D *conv76; + ReLU *relu77; + Conv2D *conv78; + ReLU *relu79; + Conv2D *conv80; + ReLU *relu82; + Conv2D *conv83; + ReLU *relu84; + Conv2D *conv85; + ReLU *relu86; + Conv2D *conv87; + ReLU *relu89; + Conv2D *conv90; + ReLU *relu91; + Conv2D *conv92; + ReLU *relu93; + Conv2D *conv94; + ReLU *relu96; + Conv2D *conv97; + ReLU *relu98; + Conv2D *conv99; + ReLU *relu100; + Conv2D *conv101; + Conv2D *conv102; + ReLU *relu104; + Conv2D *conv105; + ReLU *relu106; + Conv2D *conv107; + ReLU *relu108; + Conv2D *conv109; + ReLU *relu111; + Conv2D *conv112; + ReLU *relu113; + Conv2D *conv114; + ReLU *relu115; + Conv2D *conv116; + ReLU *relu118; + GlobalAvgPool2D *globalaveragepool119; + Flatten *flatten120; + FC *gemm121; + +public: + ResNet50() + { + conv0 = new Conv2D(3, 64, 7, 3, 2, true); + maxpool1 = new MaxPool2D(3, 1, 2); + relu2 = new ReLU(); + conv3 = new Conv2D(64, 64, 1, 0, 1, true); + relu4 = new ReLU(); + conv5 = new Conv2D(64, 64, 3, 1, 1, true); + relu6 = new ReLU(); + conv7 = new Conv2D(64, 256, 1, 0, 1, true); + conv8 = new Conv2D(64, 256, 1, 0, 1, true); + relu10 = new ReLU(); + conv11 = new Conv2D(256, 64, 1, 0, 1, true); + relu12 = new ReLU(); + conv13 = new Conv2D(64, 64, 3, 1, 1, true); + relu14 = new ReLU(); + conv15 = new Conv2D(64, 256, 1, 0, 1, true); + relu17 = new ReLU(); + conv18 = new Conv2D(256, 64, 1, 0, 1, true); + relu19 = new ReLU(); + conv20 = new Conv2D(64, 64, 3, 1, 1, true); + relu21 = new ReLU(); + conv22 = new Conv2D(64, 256, 1, 0, 1, true); + relu24 = new ReLU(); + conv25 = new Conv2D(256, 128, 1, 0, 1, true); + relu26 = new ReLU(); + conv27 = new Conv2D(128, 128, 3, 1, 
2, true); + relu28 = new ReLU(); + conv29 = new Conv2D(128, 512, 1, 0, 1, true); + conv30 = new Conv2D(256, 512, 1, 0, 2, true); + relu32 = new ReLU(); + conv33 = new Conv2D(512, 128, 1, 0, 1, true); + relu34 = new ReLU(); + conv35 = new Conv2D(128, 128, 3, 1, 1, true); + relu36 = new ReLU(); + conv37 = new Conv2D(128, 512, 1, 0, 1, true); + relu39 = new ReLU(); + conv40 = new Conv2D(512, 128, 1, 0, 1, true); + relu41 = new ReLU(); + conv42 = new Conv2D(128, 128, 3, 1, 1, true); + relu43 = new ReLU(); + conv44 = new Conv2D(128, 512, 1, 0, 1, true); + relu46 = new ReLU(); + conv47 = new Conv2D(512, 128, 1, 0, 1, true); + relu48 = new ReLU(); + conv49 = new Conv2D(128, 128, 3, 1, 1, true); + relu50 = new ReLU(); + conv51 = new Conv2D(128, 512, 1, 0, 1, true); + relu53 = new ReLU(); + conv54 = new Conv2D(512, 256, 1, 0, 1, true); + relu55 = new ReLU(); + conv56 = new Conv2D(256, 256, 3, 1, 2, true); + relu57 = new ReLU(); + conv58 = new Conv2D(256, 1024, 1, 0, 1, true); + conv59 = new Conv2D(512, 1024, 1, 0, 2, true); + relu61 = new ReLU(); + conv62 = new Conv2D(1024, 256, 1, 0, 1, true); + relu63 = new ReLU(); + conv64 = new Conv2D(256, 256, 3, 1, 1, true); + relu65 = new ReLU(); + conv66 = new Conv2D(256, 1024, 1, 0, 1, true); + relu68 = new ReLU(); + conv69 = new Conv2D(1024, 256, 1, 0, 1, true); + relu70 = new ReLU(); + conv71 = new Conv2D(256, 256, 3, 1, 1, true); + relu72 = new ReLU(); + conv73 = new Conv2D(256, 1024, 1, 0, 1, true); + relu75 = new ReLU(); + conv76 = new Conv2D(1024, 256, 1, 0, 1, true); + relu77 = new ReLU(); + conv78 = new Conv2D(256, 256, 3, 1, 1, true); + relu79 = new ReLU(); + conv80 = new Conv2D(256, 1024, 1, 0, 1, true); + relu82 = new ReLU(); + conv83 = new Conv2D(1024, 256, 1, 0, 1, true); + relu84 = new ReLU(); + conv85 = new Conv2D(256, 256, 3, 1, 1, true); + relu86 = new ReLU(); + conv87 = new Conv2D(256, 1024, 1, 0, 1, true); + relu89 = new ReLU(); + conv90 = new Conv2D(1024, 256, 1, 0, 1, true); + relu91 = new ReLU(); + conv92 = 
new Conv2D(256, 256, 3, 1, 1, true); + relu93 = new ReLU(); + conv94 = new Conv2D(256, 1024, 1, 0, 1, true); + relu96 = new ReLU(); + conv97 = new Conv2D(1024, 512, 1, 0, 1, true); + relu98 = new ReLU(); + conv99 = new Conv2D(512, 512, 3, 1, 2, true); + relu100 = new ReLU(); + conv101 = new Conv2D(512, 2048, 1, 0, 1, true); + conv102 = new Conv2D(1024, 2048, 1, 0, 2, true); + relu104 = new ReLU(); + conv105 = new Conv2D(2048, 512, 1, 0, 1, true); + relu106 = new ReLU(); + conv107 = new Conv2D(512, 512, 3, 1, 1, true); + relu108 = new ReLU(); + conv109 = new Conv2D(512, 2048, 1, 0, 1, true); + relu111 = new ReLU(); + conv112 = new Conv2D(2048, 512, 1, 0, 1, true); + relu113 = new ReLU(); + conv114 = new Conv2D(512, 512, 3, 1, 1, true); + relu115 = new ReLU(); + conv116 = new Conv2D(512, 2048, 1, 0, 1, true); + relu118 = new ReLU(); + globalaveragepool119 = new GlobalAvgPool2D(); + flatten120 = new Flatten(); + gemm121 = new FC(2048, 1000, true); + } + + Tensor &_forward(Tensor &input) + { + auto &var110 = conv0->forward(input); + auto &var111 = maxpool1->forward(var110); + auto &var112 = relu2->forward(var111); + auto &var113 = conv3->forward(var112); + auto &var114 = relu4->forward(var113); + auto &var115 = conv5->forward(var114); + auto &var116 = relu6->forward(var115); + auto &var117 = conv7->forward(var116); + auto &var118 = conv8->forward(var112); + auto &var119 = add(var117, var118); + auto &var120 = relu10->forward(var119); + auto &var121 = conv11->forward(var120); + auto &var122 = relu12->forward(var121); + auto &var123 = conv13->forward(var122); + auto &var124 = relu14->forward(var123); + auto &var125 = conv15->forward(var124); + auto &var126 = add(var125, var120); + auto &var127 = relu17->forward(var126); + auto &var128 = conv18->forward(var127); + auto &var129 = relu19->forward(var128); + auto &var130 = conv20->forward(var129); + auto &var131 = relu21->forward(var130); + auto &var132 = conv22->forward(var131); + auto &var133 = add(var132, var127); + auto 
&var134 = relu24->forward(var133); + auto &var135 = conv25->forward(var134); + auto &var136 = relu26->forward(var135); + auto &var137 = conv27->forward(var136); + auto &var138 = relu28->forward(var137); + auto &var139 = conv29->forward(var138); + auto &var140 = conv30->forward(var134); + auto &var141 = add(var139, var140); + auto &var142 = relu32->forward(var141); + auto &var143 = conv33->forward(var142); + auto &var144 = relu34->forward(var143); + auto &var145 = conv35->forward(var144); + auto &var146 = relu36->forward(var145); + auto &var147 = conv37->forward(var146); + auto &var148 = add(var147, var142); + auto &var149 = relu39->forward(var148); + auto &var150 = conv40->forward(var149); + auto &var151 = relu41->forward(var150); + auto &var152 = conv42->forward(var151); + auto &var153 = relu43->forward(var152); + auto &var154 = conv44->forward(var153); + auto &var155 = add(var154, var149); + auto &var156 = relu46->forward(var155); + auto &var157 = conv47->forward(var156); + auto &var158 = relu48->forward(var157); + auto &var159 = conv49->forward(var158); + auto &var160 = relu50->forward(var159); + auto &var161 = conv51->forward(var160); + auto &var162 = add(var161, var156); + auto &var163 = relu53->forward(var162); + auto &var164 = conv54->forward(var163); + auto &var165 = relu55->forward(var164); + auto &var166 = conv56->forward(var165); + auto &var167 = relu57->forward(var166); + auto &var168 = conv58->forward(var167); + auto &var169 = conv59->forward(var163); + auto &var170 = add(var168, var169); + auto &var171 = relu61->forward(var170); + auto &var172 = conv62->forward(var171); + auto &var173 = relu63->forward(var172); + auto &var174 = conv64->forward(var173); + auto &var175 = relu65->forward(var174); + auto &var176 = conv66->forward(var175); + auto &var177 = add(var176, var171); + auto &var178 = relu68->forward(var177); + auto &var179 = conv69->forward(var178); + auto &var180 = relu70->forward(var179); + auto &var181 = conv71->forward(var180); + auto &var182 
= relu72->forward(var181); + auto &var183 = conv73->forward(var182); + auto &var184 = add(var183, var178); + auto &var185 = relu75->forward(var184); + auto &var186 = conv76->forward(var185); + auto &var187 = relu77->forward(var186); + auto &var188 = conv78->forward(var187); + auto &var189 = relu79->forward(var188); + auto &var190 = conv80->forward(var189); + auto &var191 = add(var190, var185); + auto &var192 = relu82->forward(var191); + auto &var193 = conv83->forward(var192); + auto &var194 = relu84->forward(var193); + auto &var195 = conv85->forward(var194); + auto &var196 = relu86->forward(var195); + auto &var197 = conv87->forward(var196); + auto &var198 = add(var197, var192); + auto &var199 = relu89->forward(var198); + auto &var200 = conv90->forward(var199); + auto &var201 = relu91->forward(var200); + auto &var202 = conv92->forward(var201); + auto &var203 = relu93->forward(var202); + auto &var204 = conv94->forward(var203); + auto &var205 = add(var204, var199); + auto &var206 = relu96->forward(var205); + auto &var207 = conv97->forward(var206); + auto &var208 = relu98->forward(var207); + auto &var209 = conv99->forward(var208); + auto &var210 = relu100->forward(var209); + auto &var211 = conv101->forward(var210); + auto &var212 = conv102->forward(var206); + auto &var213 = add(var211, var212); + auto &var214 = relu104->forward(var213); + auto &var215 = conv105->forward(var214); + auto &var216 = relu106->forward(var215); + auto &var217 = conv107->forward(var216); + auto &var218 = relu108->forward(var217); + auto &var219 = conv109->forward(var218); + auto &var220 = add(var219, var214); + auto &var221 = relu111->forward(var220); + auto &var222 = conv112->forward(var221); + auto &var223 = relu113->forward(var222); + auto &var224 = conv114->forward(var223); + auto &var225 = relu115->forward(var224); + auto &var226 = conv116->forward(var225); + auto &var227 = add(var226, var221); + auto &var228 = relu118->forward(var227); + auto &var229 = 
globalaveragepool119->forward(var228); + auto &var230 = flatten120->forward(var229); + auto &var231 = gemm121->forward(var230); + return var231; + } +}; + +template +class PVGG16NoRelu : public SytorchModule +{ + +public: + Conv2D *conv0; + ReLU *relu1; + Conv2D *conv2; + AvgPool2D *maxpool3; + ReLU *relu4; + Conv2D *conv5; + ReLU *relu6; + Conv2D *conv7; + AvgPool2D *maxpool8; + ReLU *relu9; + Conv2D *conv10; + ReLU *relu11; + Conv2D *conv12; + ReLU *relu13; + Conv2D *conv14; + AvgPool2D *maxpool15; + ReLU *relu16; + Conv2D *conv17; + ReLU *relu18; + Conv2D *conv19; + ReLU *relu20; + Conv2D *conv21; + AvgPool2D *maxpool22; + ReLU *relu23; + Conv2D *conv24; + ReLU *relu25; + Conv2D *conv26; + ReLU *relu27; + Conv2D *conv28; + AvgPool2D *maxpool29; + ReLU *relu30; + Flatten *reshape31; + FC *gemm32; + ReLU *relu33; + FC *gemm34; + ReLU *relu35; + FC *gemm36; + +public: + PVGG16NoRelu() + { + conv0 = new Conv2D(3, 64, 3, 1, 1, false); + relu1 = new ReLU(); + conv2 = new Conv2D(64, 64, 3, 1, 1, false); + maxpool3 = new AvgPool2D(2, 0, 2); + relu4 = new ReLU(); + conv5 = new Conv2D(64, 128, 3, 1, 1, false); + relu6 = new ReLU(); + conv7 = new Conv2D(128, 128, 3, 1, 1, false); + maxpool8 = new AvgPool2D(2, 0, 2); + relu9 = new ReLU(); + conv10 = new Conv2D(128, 256, 3, 1, 1, false); + relu11 = new ReLU(); + conv12 = new Conv2D(256, 256, 3, 1, 1, false); + relu13 = new ReLU(); + conv14 = new Conv2D(256, 256, 3, 1, 1, false); + maxpool15 = new AvgPool2D(2, 0, 2); + relu16 = new ReLU(); + conv17 = new Conv2D(256, 512, 3, 1, 1, false); + relu18 = new ReLU(); + conv19 = new Conv2D(512, 512, 3, 1, 1, false); + relu20 = new ReLU(); + conv21 = new Conv2D(512, 512, 3, 1, 1, false); + maxpool22 = new AvgPool2D(2, 0, 2); + relu23 = new ReLU(); + conv24 = new Conv2D(512, 512, 3, 1, 1, false); + relu25 = new ReLU(); + conv26 = new Conv2D(512, 512, 3, 1, 1, false); + relu27 = new ReLU(); + conv28 = new Conv2D(512, 512, 3, 1, 1, false); + maxpool29 = new AvgPool2D(2, 0, 2); + relu30 = 
new ReLU(); + reshape31 = new Flatten(); + gemm32 = new FC(512, 256, true); + relu33 = new ReLU(); + gemm34 = new FC(256, 256, true); + relu35 = new ReLU(); + gemm36 = new FC(256, 10, true); + } + + Tensor &_forward(Tensor &input) + { + auto &var35 = conv0->forward(input); + auto &var36 = relu1->forward(var35); + auto &var37 = conv2->forward(var36); + auto &var38 = maxpool3->forward(var37); + auto &var39 = relu4->forward(var38); + auto &var40 = conv5->forward(var39); + auto &var41 = relu6->forward(var40); + auto &var42 = conv7->forward(var41); + auto &var43 = maxpool8->forward(var42); + auto &var44 = relu9->forward(var43); + auto &var45 = conv10->forward(var44); + auto &var46 = relu11->forward(var45); + auto &var47 = conv12->forward(var46); + auto &var48 = relu13->forward(var47); + auto &var49 = conv14->forward(var48); + auto &var50 = maxpool15->forward(var49); + auto &var51 = relu16->forward(var50); + auto &var52 = conv17->forward(var51); + auto &var53 = relu18->forward(var52); + auto &var54 = conv19->forward(var53); + auto &var55 = relu20->forward(var54); + auto &var56 = conv21->forward(var55); + auto &var57 = maxpool22->forward(var56); + auto &var58 = relu23->forward(var57); + auto &var59 = conv24->forward(var58); + auto &var60 = relu25->forward(var59); + auto &var61 = conv26->forward(var60); + auto &var62 = relu27->forward(var61); + auto &var63 = conv28->forward(var62); + auto &var64 = maxpool29->forward(var63); + auto &var65 = relu30->forward(var64); + auto &var66 = reshape31->forward(var65); + auto &var67 = gemm32->forward(var66); + auto &var68 = relu33->forward(var67); + auto &var69 = gemm34->forward(var68); + auto &var70 = relu35->forward(var69); + auto &var71 = gemm36->forward(var70); + return var71; + } +}; + +template +class PAlexnetNoRelu : public SytorchModule +{ + +public: + Conv2D *conv0; + AvgPool2D *maxpool1; + ReLU *relu2; + Conv2D *conv3; + AvgPool2D *maxpool4; + ReLU *relu5; + Conv2D *conv6; + ReLU *relu7; + Conv2D *conv8; + ReLU *relu9; + 
Conv2D *conv10; + ReLU *relu11; + Flatten *reshape12; + FC *gemm13; + ReLU *relu14; + FC *gemm15; + ReLU *relu16; + FC *gemm17; + +public: + PAlexnetNoRelu() + { + conv0 = new Conv2D(3, 96, 11, 9, 4, false); + maxpool1 = new AvgPool2D(3, 0, 2); + relu2 = new ReLU(); + conv3 = new Conv2D(96, 256, 5, 1, 1, false); + maxpool4 = new AvgPool2D(2, 0, 1); + relu5 = new ReLU(); + conv6 = new Conv2D(256, 384, 3, 1, 1, false); + relu7 = new ReLU(); + conv8 = new Conv2D(384, 384, 3, 1, 1, false); + relu9 = new ReLU(); + conv10 = new Conv2D(384, 256, 3, 1, 1, false); + relu11 = new ReLU(); + reshape12 = new Flatten(); + gemm13 = new FC(256, 256, true); + relu14 = new ReLU(); + gemm15 = new FC(256, 256, true); + relu16 = new ReLU(); + gemm17 = new FC(256, 10, true); + } + + Tensor &_forward(Tensor &input) + { + auto &var15 = conv0->forward(input); + auto &var16 = maxpool1->forward(var15); + auto &var17 = relu2->forward(var16); + auto &var18 = conv3->forward(var17); + auto &var19 = maxpool4->forward(var18); + auto &var20 = relu5->forward(var19); + auto &var21 = conv6->forward(var20); + auto &var22 = relu7->forward(var21); + auto &var23 = conv8->forward(var22); + auto &var24 = relu9->forward(var23); + auto &var25 = conv10->forward(var24); + auto &var26 = relu11->forward(var25); + auto &var27 = reshape12->forward(var26); + auto &var28 = gemm13->forward(var27); + auto &var29 = relu14->forward(var28); + auto &var30 = gemm15->forward(var29); + auto &var31 = relu16->forward(var30); + auto &var32 = gemm17->forward(var31); + return var32; + } +}; + +template +class FalconAlexnetNoRelu : public SytorchModule +{ + +public: + Conv2D *conv0; + MaxPool2D *maxpool1; + ReLU *relu2; + Conv2D *conv3; + MaxPool2D *maxpool4; + ReLU *relu5; + Conv2D *conv6; + ReLU *relu7; + Conv2D *conv8; + ReLU *relu9; + Conv2D *conv10; + ReLU *relu11; + Flatten *reshape12; + FC *gemm13; + ReLU *relu14; + FC *gemm15; + ReLU *relu16; + FC *gemm17; + +public: + FalconAlexnetNoRelu() + { + conv0 = new Conv2D(3, 96, 
11, 9, 4, true); + maxpool1 = new MaxPool2D(3, 0, 2); + relu2 = new ReLU(); + conv3 = new Conv2D(96, 256, 5, 1, 1, true); + maxpool4 = new MaxPool2D(2, 0, 1); + relu5 = new ReLU(); + conv6 = new Conv2D(256, 384, 3, 1, 1, true); + relu7 = new ReLU(); + conv8 = new Conv2D(384, 384, 3, 1, 1, true); + relu9 = new ReLU(); + conv10 = new Conv2D(384, 256, 3, 1, 1, true); + relu11 = new ReLU(); + reshape12 = new Flatten(); + gemm13 = new FC(256, 256, true); + relu14 = new ReLU(); + gemm15 = new FC(256, 256, true); + relu16 = new ReLU(); + gemm17 = new FC(256, 10, true); + } + + Tensor &_forward(Tensor &input) + { + auto &var15 = conv0->forward(input); + auto &var16 = maxpool1->forward(var15); + auto &var17 = relu2->forward(var16); + auto &var18 = conv3->forward(var17); + auto &var19 = maxpool4->forward(var18); + auto &var20 = relu5->forward(var19); + auto &var21 = conv6->forward(var20); + auto &var22 = relu7->forward(var21); + auto &var23 = conv8->forward(var22); + auto &var24 = relu9->forward(var23); + auto &var25 = conv10->forward(var24); + auto &var26 = relu11->forward(var25); + auto &var27 = reshape12->forward(var26); + auto &var28 = gemm13->forward(var27); + auto &var29 = relu14->forward(var28); + auto &var30 = gemm15->forward(var29); + auto &var31 = relu16->forward(var30); + auto &var32 = gemm17->forward(var31); + return var32; + } +}; + +template +SytorchModule *getCNN(std::string name) +{ + SytorchModule *m; + if (name.compare("CNN2") == 0) + { + m = new CNN2(); + } + else if (name.compare("CNN3") == 0) + { + m = new CNN3(); + } + else if (name.compare("ResNet18") == 0) + { + m = new ResNet18(); + } + else if (name.compare("ResNet50") == 0) + { + m = new ResNet50(); + } + else if (name.compare("VGG16") == 0) + { + m = new VGG16(); + } + else if (name.compare("P-LeNet") == 0) + { + m = new PLenetNoReluAvgPool(); + } + else if (name.compare("P-SecureML") == 0) + { + m = new PSecureMlNoRelu(); + } + else if (name.compare("P-VGG16") == 0) + { + m = new PVGG16NoRelu(); 
+ } + else if (name.compare("P-AlexNet") == 0) + { + m = new PAlexnetNoRelu(); + } + else if (name.compare("AlexNet") == 0) + { + m = new FalconAlexnetNoRelu(); + } + else if (name.compare("ModelB") == 0) + { + m = new MinionnLenet(); + } + else + { + assert(0 && "unknown model"); + } + return m; +} + +template +dcf::orca::GPUModel *getGPUModel(std::string modelName, Tensor inp) +{ + dcf::orca::GPUModel *m; + if (*(modelName.data()) == 'P') + { + m = getPiranhaCNN(modelName, inp); + } + else + { + m = getOrcaCNN(modelName, inp); + } + return m; +} + +// in LlamaImproved, mode takes the value according to the following rule: +// 0: the layer takes as input \ell bits and outputs \ell bits +// 1: the layer takes as input \ell bits and outputs \ell - scale bits +// 2: the layer takes as input \ell - scale bits and outputs \ell bits +// 3: the layer takes as input \ell - scale bits and outputs \ell - scale bits + +template +dcf::orca::GPUModel *getOrcaCNN(std::string modelName, Tensor inp) +{ + auto m = getCNN(modelName); + m->init((u64)dcf::orca::global::scale, inp); + m->train(); + auto b = new Orca(); + m->setBackend(b); + m->optimize(); + dcf::orca::GPUModel *gpuModel = new dcf::orca::GPUModel(); + for (auto n : m->allNodesInExecutionOrder) + { + auto layer = n->layer; + if (layer->name == "Conv2D") + { + assert(layer->mode == 1); + auto convLayer = (Conv2D *)(layer); + int N, h, w, c; + N = convLayer->inputDerivative.shape[0]; + h = convLayer->inputDerivative.shape[1]; + w = convLayer->inputDerivative.shape[2]; + c = convLayer->inputDerivative.shape[3]; + assert(c == convLayer->ci); + auto orcaConv2D = new dcf::orca::Conv2DLayer((int)dcf::orca::global::bw, (int)dcf::orca::global::bw, N, h, w, (int)convLayer->ci, (int)convLayer->fh, (int)convLayer->fw, (int)convLayer->co, (int)convLayer->padding, (int)convLayer->padding, (int)convLayer->padding, (int)convLayer->padding, (int)convLayer->stride, (int)convLayer->stride, convLayer->useBias, 
dcf::TruncateType::StochasticTR, dcf::TruncateType::StochasticTruncate, !layer->isFirst, layer->isFirst); + auto filter = convLayer->getweights(); + // memcpy(orcaConv2D->F, filter.data, filter.size * sizeof(T)); + if (convLayer->useBias) + { + auto bias = convLayer->getbias(); + // memcpy(orcaConv2D->b, bias.data, bias.size * sizeof(T)); + } + gpuModel->layers.push_back(orcaConv2D); + } + + else if (layer->name == "MaxPool2D") + { + assert(layer->mode == 3); + auto maxPoolLayer = (MaxPool2D *)(layer); + int bwToUse = dcf::orca::global::bw; + bwToUse -= dcf::orca::global::scale; + int N, h, w, c; + N = maxPoolLayer->inputDerivative.shape[0]; + h = maxPoolLayer->inputDerivative.shape[1]; + w = maxPoolLayer->inputDerivative.shape[2]; + c = maxPoolLayer->inputDerivative.shape[3]; + auto orcaMaxPool = new dcf::orca::MaxPool2DLayer(bwToUse, bwToUse, dcf::orca::global::bw, N, h, w, c, maxPoolLayer->ks, maxPoolLayer->ks, maxPoolLayer->stride, maxPoolLayer->stride, maxPoolLayer->padding, maxPoolLayer->padding, maxPoolLayer->padding, maxPoolLayer->padding); + gpuModel->layers.push_back(orcaMaxPool); + } + else if (layer->name == "FC") + { + assert(layer->mode == 1); + auto fcLayer = (FC *)(layer); + auto orcaFC = new dcf::orca::FCLayer(dcf::orca::global::bw, dcf::orca::global::bw, (int)fcLayer->inputDerivative.shape[0], (int)fcLayer->out, (int)fcLayer->in, dcf::TruncateType::StochasticTR, dcf::TruncateType::StochasticTruncate, fcLayer->useBias, !layer->isFirst, layer->isFirst); + auto W = fcLayer->getweights(); + // memcpy(orcaFC->W, W.data, W.size * sizeof(T)); + if (fcLayer->useBias) + { + auto bias = fcLayer->getbias(); + // memcpy(orcaFC->Y, bias.data, bias.size * sizeof(T)); + } + gpuModel->layers.push_back(orcaFC); + } + else if (layer->name == "ReLU") + { + assert(layer->mode == 2); + auto reluLayer = (ReLU *)(layer); + int r = layer->activation.size(); + auto orcaRelu = new dcf::orca::ReluExtendLayer(dcf::orca::global::bw - dcf::orca::global::scale, 
dcf::orca::global::bw, r); + gpuModel->layers.push_back(orcaRelu); + } + } + int l = m->allNodesInExecutionOrder.size(); + gpuModel->batchSz = inp.shape[0]; + gpuModel->inpSz = inp.size(); + gpuModel->classes = m->allNodesInExecutionOrder[l - 1]->currTensor->shape[1]; + return gpuModel; +} + +template +dcf::orca::GPUModel *getPiranhaCNN(std::string modelName, Tensor inp) +{ + auto m = getCNN(modelName); + if (modelName.compare("P-SecureML") == 0) + { + Tensor temp(nullptr, {inp.shape[0], inp.size() / inp.shape[0]}); + m->init((u64)dcf::orca::global::scale, temp); + } + else + { + m->init((u64)dcf::orca::global::scale, inp); + } + m->train(); + auto b = new Piranha(); + m->setBackend(b); + m->optimize(); + dcf::orca::GPUModel *gpuModel = new dcf::orca::GPUModel(); + for (auto n : m->allNodesInExecutionOrder) + { + auto layer = n->layer; + if (layer->name == "Conv2D") + { + auto convLayer = (Conv2D *)(layer); + assert(!convLayer->useBias); + int N, h, w, c; + N = convLayer->inputDerivative.shape[0]; + h = convLayer->inputDerivative.shape[1]; + w = convLayer->inputDerivative.shape[2]; + c = convLayer->inputDerivative.shape[3]; + assert(c == convLayer->ci); + auto gpuLayer = new dcf::orca::Conv2DLayer((int)dcf::orca::global::bw, (int)dcf::orca::global::bw, (int)N, (int)h, (int)w, (int)convLayer->ci, (int)convLayer->fh, (int)convLayer->fw, (int)convLayer->co, (int)convLayer->padding, (int)convLayer->padding, (int)convLayer->padding, (int)convLayer->padding, (int)convLayer->stride, (int)convLayer->stride, convLayer->useBias, dcf::TruncateType::LocalARS, dcf::TruncateType::LocalARS, !layer->isFirst, layer->isFirst); + gpuModel->layers.push_back(gpuLayer); + } + else if (layer->name == "FC") + { + auto fcLayer = (FC *)(layer); + auto gpuLayer = new dcf::orca::FCLayer((int)dcf::orca::global::bw, (int)dcf::orca::global::bw, (int)fcLayer->inputDerivative.shape[0], (int)fcLayer->out, (int)fcLayer->in, dcf::TruncateType::LocalARS, dcf::TruncateType::LocalARS, fcLayer->useBias, 
!layer->isFirst, layer->isFirst); + gpuModel->layers.push_back(gpuLayer); + } + else if (layer->name == "ReLU") + { + auto reluLayer = (ReLU *)(layer); + int r = layer->activation.size(); + // printf("r=%lu\n", r); + int inputBw = dcf::orca::global::bw - dcf::orca::global::scale - layer->mode; + auto gpuLayer = new dcf::orca::ReluLayer(inputBw, dcf::orca::global::bw, r); + gpuModel->layers.push_back(gpuLayer); + } + else if (layer->name == "AvgPool2D") + { + auto avgPoolLayer = (AvgPool2D *)(layer); + assert(n->parents.size() == 1); + auto p = n->parents[0]; + auto &a = p->layer->activation; + assert(a.shape.size() == 4); + int N, h, w, c; + N = a.shape[0]; + h = a.shape[1]; + w = a.shape[2]; + c = a.shape[3]; + auto gpuLayer = new dcf::orca::AvgPool2DLayer(dcf::orca::global::bw, dcf::orca::global::bw, dcf::orca::global::scale, N, h, w, c, avgPoolLayer->ks, avgPoolLayer->ks, avgPoolLayer->stride, avgPoolLayer->stride, avgPoolLayer->padding, avgPoolLayer->padding, avgPoolLayer->padding, avgPoolLayer->padding, dcf::TruncateType::LocalARS, dcf::TruncateType::LocalARS); + gpuModel->layers.push_back(gpuLayer); + } + } + int l = m->allNodesInExecutionOrder.size(); + printf("########Layers=%d\n", l); + gpuModel->batchSz = inp.shape[0]; + gpuModel->inpSz = inp.size(); + gpuModel->classes = m->allNodesInExecutionOrder[l - 1]->currTensor->shape[1]; + return gpuModel; +} \ No newline at end of file diff --git a/GPU-MPC/experiments/orca/config.json b/GPU-MPC/experiments/orca/config.json new file mode 100644 index 00000000..7251b442 --- /dev/null +++ b/GPU-MPC/experiments/orca/config.json @@ -0,0 +1,22 @@ +{ + "P0": { + "dealer": { + "gpu": 0, + "key_dir": "/tmp/" + }, + "evaluator": { + "gpu": 1, + "peer": "0.0.0.0" + } + }, + "P1": { + "dealer": { + "gpu": 2, + "key_dir": "/tmp/" + }, + "evaluator": { + "gpu": 3, + "peer": "0.0.0.0" + } + } +} diff --git a/GPU-MPC/experiments/orca/datasets/cifar-10/download-cifar10.sh 
b/GPU-MPC/experiments/orca/datasets/cifar-10/download-cifar10.sh new file mode 100755 index 00000000..ea42d46d --- /dev/null +++ b/GPU-MPC/experiments/orca/datasets/cifar-10/download-cifar10.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +# If not already downloaded +if [ ! -f ./cifar-10-batches-bin/data_batch_1.bin ]; then + # If the archive does not exist, download it + if [ ! -f ./cifar-10-binary.tar.gz ]; then + wget https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz + fi + + # Extract all the files + tar xf cifar-10-binary.tar.gz +fi \ No newline at end of file diff --git a/GPU-MPC/experiments/orca/datasets/cifar10.h b/GPU-MPC/experiments/orca/datasets/cifar10.h new file mode 100644 index 00000000..2ccc0462 --- /dev/null +++ b/GPU-MPC/experiments/orca/datasets/cifar10.h @@ -0,0 +1,333 @@ +//======================================================================= +// Copyright (c) 2017 Baptiste Wicht +// Distributed under the terms of the MIT License. +// (See accompanying file LICENSE or copy at +// http://opensource.org/licenses/MIT) +//======================================================================= + +/*! + * \file + * \brief Contains functions to read the CIFAR-10 dataset + */ + +#ifndef CIFAR10_READER_HPP +#define CIFAR10_READER_HPP + +#include +#include +#include +#include +#include +#include + +namespace cifar { + +/*! + * \brief Represents a complete CIFAR10 dataset + * \tparam Container The container to use + * \tparam Image The type of image + * \tparam Label The type of label + */ +template