From 30679fffd32a6653ae53b58e1ad3b240330aa235 Mon Sep 17 00:00:00 2001 From: Neha J Date: Fri, 17 May 2024 05:05:03 -0700 Subject: [PATCH 01/32] Adding GPU-MPC --- GPU-MPC/Dockerfile_Gen | 34 + GPU-MPC/Makefile | 132 + GPU-MPC/README.md | 123 + GPU-MPC/backend/orca.h | 243 + GPU-MPC/backend/orca_base.h | 311 + GPU-MPC/backend/piranha.h | 160 + GPU-MPC/backend/sigma.h | 428 ++ GPU-MPC/experiments/__init__.py | 21 + GPU-MPC/experiments/orca/__init__.py | 21 + GPU-MPC/experiments/orca/cnn.h | 1388 ++++ GPU-MPC/experiments/orca/config.json | 22 + .../datasets/cifar-10/download-cifar10.sh | 12 + GPU-MPC/experiments/orca/datasets/cifar10.h | 333 + GPU-MPC/experiments/orca/datasets/gpu_data.cu | 71 + GPU-MPC/experiments/orca/datasets/gpu_data.h | 15 + GPU-MPC/experiments/orca/datasets/mnist.cpp | 17 + GPU-MPC/experiments/orca/datasets/mnist.h | 227 + GPU-MPC/experiments/orca/model_accuracy.cu | 30 + GPU-MPC/experiments/orca/model_accuracy.h | 228 + GPU-MPC/experiments/orca/orca_dealer.cu | 360 ++ GPU-MPC/experiments/orca/orca_evaluator.cu | 504 ++ GPU-MPC/experiments/orca/orca_inference.cu | 122 + GPU-MPC/experiments/orca/piranha.cu | 119 + GPU-MPC/experiments/orca/run_experiment.py | 326 + GPU-MPC/experiments/orca/share_data.cpp | 182 + GPU-MPC/experiments/sigma/bert.h | 103 + GPU-MPC/experiments/sigma/gpt2.h | 125 + GPU-MPC/experiments/sigma/llama2.h | 120 + GPU-MPC/experiments/sigma/sigma.cu | 204 + GPU-MPC/experiments/utils.py | 115 + GPU-MPC/ext/sytorch/.gitignore | 25 + GPU-MPC/ext/sytorch/CMakeLists.txt | 372 ++ GPU-MPC/ext/sytorch/README.md | 24 + .../Toy example- multiple inference.md | 113 + .../sytorch/Toy example- single inference.md | 97 + GPU-MPC/ext/sytorch/examples/bert.cpp | 554 ++ .../ext/sytorch/examples/bertbenchmark.cpp | 260 + .../sytorch/examples/gpt-neo_nexttoken.cpp | 499 ++ GPU-MPC/ext/sytorch/examples/gpt2.cpp | 762 +++ .../ext/sytorch/examples/gpt2benchmark.cpp | 244 + .../ext/sytorch/examples/gpt2correctness.cpp | 336 + 
GPU-MPC/ext/sytorch/examples/gpt2dummy.cpp | 162 + GPU-MPC/ext/sytorch/examples/gptneo.cpp | 398 ++ .../ext/sytorch/examples/gptneobenchmark.cpp | 304 + GPU-MPC/ext/sytorch/examples/llama7b.cpp | 263 + GPU-MPC/ext/sytorch/examples/resnet18.cpp | 210 + GPU-MPC/ext/sytorch/examples/resnet50.cpp | 404 ++ GPU-MPC/ext/sytorch/examples/vgg16.cpp | 185 + .../ext/sytorch/ext/bitpack/CMakeLists.txt | 22 + .../ext/bitpack/include/bitpack/bitpack.h | 11 + .../ext/bitpack/src/bitpack/bitpack.cpp | 136 + .../ext/sytorch/ext/bitpack/tests/test.cpp | 38 + .../sytorch/ext/cryptoTools/CMakeLists.txt | 18 + GPU-MPC/ext/sytorch/ext/cryptoTools/LICENSE | 50 + .../cryptoTools/Common/Defines.cpp | 88 + .../cryptoTools/cryptoTools/Common/Defines.h | 149 + .../cryptoTools/cryptoTools/Common/Log.cpp | 136 + .../ext/cryptoTools/cryptoTools/Common/Log.h | 222 + .../cryptoTools/cryptoTools/Common/config.h | 27 + .../cryptoTools/cryptoTools/Crypto/AES.cpp | 792 +++ .../ext/cryptoTools/cryptoTools/Crypto/AES.h | 136 + .../cryptoTools/cryptoTools/Crypto/PRNG.cpp | 78 + .../ext/cryptoTools/cryptoTools/Crypto/PRNG.h | 178 + .../cryptoTools/cryptoTools/gsl/GSL.natvis | 98 + .../cryptoTools/cryptoTools/gsl/gls-lite.hpp | 2382 +++++++ .../ext/cryptoTools/cryptoTools/gsl/gsl | 207 + .../cryptoTools/cryptoTools/gsl/gsl_algorithm | 61 + .../cryptoTools/cryptoTools/gsl/gsl_assert | 97 + .../ext/cryptoTools/cryptoTools/gsl/gsl_byte | 190 + .../ext/cryptoTools/cryptoTools/gsl/gsl_util | 170 + .../cryptoTools/cryptoTools/gsl/multi_span | 2266 +++++++ .../ext/cryptoTools/cryptoTools/gsl/span | 735 +++ .../cryptoTools/cryptoTools/gsl/string_span | 847 +++ GPU-MPC/ext/sytorch/ext/llama/CMakeLists.txt | 64 + GPU-MPC/ext/sytorch/ext/llama/and.cpp | 75 + GPU-MPC/ext/sytorch/ext/llama/and.h | 28 + GPU-MPC/ext/sytorch/ext/llama/api.cpp | 4842 ++++++++++++++ GPU-MPC/ext/sytorch/ext/llama/clip.cpp | 93 + GPU-MPC/ext/sytorch/ext/llama/clip.h | 28 + GPU-MPC/ext/sytorch/ext/llama/conv.cpp | 472 ++ 
GPU-MPC/ext/sytorch/ext/llama/dcf.cpp | 378 ++ GPU-MPC/ext/sytorch/ext/llama/dpf.cpp | 538 ++ .../ext/sytorch/ext/llama/fixtobfloat16.cpp | 135 + GPU-MPC/ext/sytorch/ext/llama/fixtobfloat16.h | 29 + GPU-MPC/ext/sytorch/ext/llama/float.cpp | 209 + GPU-MPC/ext/sytorch/ext/llama/float.h | 46 + .../ext/sytorch/ext/llama/include/llama/api.h | 196 + .../sytorch/ext/llama/include/llama/array.h | 72 + .../sytorch/ext/llama/include/llama/assert.h | 30 + .../sytorch/ext/llama/include/llama/comms.h | 488 ++ .../sytorch/ext/llama/include/llama/config.h | 38 + .../sytorch/ext/llama/include/llama/conv.h | 129 + .../ext/sytorch/ext/llama/include/llama/dcf.h | 82 + .../ext/sytorch/ext/llama/include/llama/dpf.h | 36 + .../sytorch/ext/llama/include/llama/freekey.h | 410 ++ .../ext/llama/include/llama/group_element.h | 120 + .../ext/llama/include/llama/input_prng.h | 61 + .../sytorch/ext/llama/include/llama/keypack.h | 445 ++ .../sytorch/ext/llama/include/llama/prng.h | 29 + .../sytorch/ext/llama/include/llama/stats.h | 77 + .../sytorch/ext/llama/include/llama/utils.h | 173 + GPU-MPC/ext/sytorch/ext/llama/lut.cpp | 225 + GPU-MPC/ext/sytorch/ext/llama/lut.h | 33 + GPU-MPC/ext/sytorch/ext/llama/mic.cpp | 89 + GPU-MPC/ext/sytorch/ext/llama/mic.h | 34 + GPU-MPC/ext/sytorch/ext/llama/msnzb.cpp | 76 + GPU-MPC/ext/sytorch/ext/llama/msnzb.h | 34 + GPU-MPC/ext/sytorch/ext/llama/mult.cpp | 115 + GPU-MPC/ext/sytorch/ext/llama/mult.h | 51 + GPU-MPC/ext/sytorch/ext/llama/pubcmp.cpp | 61 + GPU-MPC/ext/sytorch/ext/llama/pubcmp.h | 27 + GPU-MPC/ext/sytorch/ext/llama/pubdiv.cpp | 248 + GPU-MPC/ext/sytorch/ext/llama/pubdiv.h | 61 + GPU-MPC/ext/sytorch/ext/llama/relu.cpp | 312 + GPU-MPC/ext/sytorch/ext/llama/relu.h | 64 + GPU-MPC/ext/sytorch/ext/llama/select.cpp | 71 + GPU-MPC/ext/sytorch/ext/llama/select.h | 26 + GPU-MPC/ext/sytorch/ext/llama/signextend.cpp | 80 + GPU-MPC/ext/sytorch/ext/llama/signextend.h | 28 + .../ext/sytorch/ext/llama/src/llama/comms.cpp | 1808 ++++++ 
.../sytorch/ext/llama/src/llama/config.cpp | 35 + .../ext/llama/src/llama/input_prng.cpp | 173 + .../ext/sytorch/ext/llama/src/llama/prng.cpp | 48 + .../ext/sytorch/ext/llama/src/llama/stats.cpp | 105 + .../ext/sytorch/ext/llama/src/llama/utils.cpp | 685 ++ GPU-MPC/ext/sytorch/ext/llama/taylor.cpp | 172 + GPU-MPC/ext/sytorch/ext/llama/taylor.h | 43 + GPU-MPC/ext/sytorch/ext/llama/truncate.cpp | 108 + GPU-MPC/ext/sytorch/ext/llama/truncate.h | 38 + GPU-MPC/ext/sytorch/ext/llama/wrap.cpp | 125 + GPU-MPC/ext/sytorch/ext/llama/wrap.h | 29 + GPU-MPC/ext/sytorch/ext/sci/CMakeLists.txt | 44 + GPU-MPC/ext/sytorch/ext/sci/README.md | 59 + GPU-MPC/ext/sytorch/ext/sci/cmake/.gitignore | 1 + .../sytorch/ext/sci/cmake/SCIConfig.cmake.in | 18 + .../sci/cmake/SecureFixedPointConfig.cmake | 16 + .../sytorch/ext/sci/cmake/install_EMP.cmake | 49 + .../ext/sci/cmake/install_Eigen3.cmake | 18 + GPU-MPC/ext/sytorch/ext/sci/cmake/seal.patch | 13 + .../ext/sci/src/BuildingBlocks/CMakeLists.txt | 10 + .../sci/src/BuildingBlocks/aux-protocols.cpp | 745 +++ .../sci/src/BuildingBlocks/aux-protocols.h | 167 + .../ext/sci/src/BuildingBlocks/truncation.cpp | 298 + .../ext/sci/src/BuildingBlocks/truncation.h | 111 + .../src/BuildingBlocks/value-extension.cpp | 111 + .../sci/src/BuildingBlocks/value-extension.h | 50 + .../ext/sytorch/ext/sci/src/CMakeLists.txt | 123 + .../ext/sci/src/FloatingPoint/CMakeLists.txt | 4 + .../ext/sci/src/FloatingPoint/bool-data.cpp | 171 + .../ext/sci/src/FloatingPoint/bool-data.h | 162 + .../ext/sci/src/FloatingPoint/fixed-point.cpp | 1095 ++++ .../ext/sci/src/FloatingPoint/fixed-point.h | 414 ++ .../sci/src/FloatingPoint/floating-point.cpp | 1611 +++++ .../sci/src/FloatingPoint/floating-point.h | 462 ++ .../sci/src/FloatingPoint/fp-math-coeffs.h | 712 +++ .../ext/sci/src/FloatingPoint/fp-math.cpp | 924 +++ .../ext/sci/src/FloatingPoint/fp-math.h | 65 + .../ext/sytorch/ext/sci/src/GC/CMakeLists.txt | 7 + GPU-MPC/ext/sytorch/ext/sci/src/GC/aes_opt.h | 152 + 
GPU-MPC/ext/sytorch/ext/sci/src/GC/bit.h | 68 + GPU-MPC/ext/sytorch/ext/sci/src/GC/bit.hpp | 80 + .../ext/sci/src/GC/circuit_execution.h | 60 + .../ext/sytorch/ext/sci/src/GC/comparable.h | 57 + .../ext/sytorch/ext/sci/src/GC/emp-sh2pc.h | 4 + .../ext/sytorch/ext/sci/src/GC/emp-tool.cpp | 43 + GPU-MPC/ext/sytorch/ext/sci/src/GC/emp-tool.h | 19 + GPU-MPC/ext/sytorch/ext/sci/src/GC/f2k.h | 232 + .../sytorch/ext/sci/src/GC/halfgate_eva.cpp | 52 + .../ext/sytorch/ext/sci/src/GC/halfgate_eva.h | 66 + .../sytorch/ext/sci/src/GC/halfgate_gen.cpp | 61 + .../ext/sytorch/ext/sci/src/GC/halfgate_gen.h | 80 + GPU-MPC/ext/sytorch/ext/sci/src/GC/integer.h | 87 + .../ext/sytorch/ext/sci/src/GC/integer.hpp | 424 ++ GPU-MPC/ext/sytorch/ext/sci/src/GC/mitccrh.h | 86 + GPU-MPC/ext/sytorch/ext/sci/src/GC/number.h | 74 + .../ext/sci/src/GC/protocol_execution.h | 55 + .../ext/sytorch/ext/sci/src/GC/semihonest.h | 51 + GPU-MPC/ext/sytorch/ext/sci/src/GC/sh_eva.h | 112 + GPU-MPC/ext/sytorch/ext/sci/src/GC/sh_gen.h | 118 + GPU-MPC/ext/sytorch/ext/sci/src/GC/sh_party.h | 68 + .../ext/sytorch/ext/sci/src/GC/swappable.h | 50 + GPU-MPC/ext/sytorch/ext/sci/src/GC/utils.h | 47 + .../ext/sci/src/LinearHE/CMakeLists.txt | 34 + .../ext/sci/src/LinearHE/conv-field.cpp | 946 +++ .../sytorch/ext/sci/src/LinearHE/conv-field.h | 173 + .../sytorch/ext/sci/src/LinearHE/defines-HE.h | 42 + .../sci/src/LinearHE/elemwise-prod-field.cpp | 220 + .../sci/src/LinearHE/elemwise-prod-field.h | 56 + .../sytorch/ext/sci/src/LinearHE/fc-field.cpp | 395 ++ .../sytorch/ext/sci/src/LinearHE/fc-field.h | 98 + .../ext/sci/src/LinearHE/generate_primes.py | 26 + .../sytorch/ext/sci/src/LinearHE/utils-HE.cpp | 237 + .../sytorch/ext/sci/src/LinearHE/utils-HE.h | 70 + .../ext/sci/src/LinearOT/CMakeLists.txt | 6 + .../ext/sci/src/LinearOT/linear-ot.cpp | 813 +++ .../sytorch/ext/sci/src/LinearOT/linear-ot.h | 113 + .../ext/sci/src/LinearOT/linear-uniform.h | 750 +++ .../sytorch/ext/sci/src/Math/CMakeLists.txt | 4 + 
.../ext/sci/src/Math/math-functions.cpp | 889 +++ .../sytorch/ext/sci/src/Math/math-functions.h | 82 + .../ext/sci/src/Millionaire/CMakeLists.txt | 4 + .../src/Millionaire/bit-triple-generator.h | 323 + .../ext/sci/src/Millionaire/equality.h | 353 ++ .../ext/sci/src/Millionaire/millionaire.h | 557 ++ .../Millionaire/millionaire_with_equality.h | 391 ++ .../ext/sci/src/NonLinear/CMakeLists.txt | 4 + .../sytorch/ext/sci/src/NonLinear/argmax.h | 476 ++ .../ext/sci/src/NonLinear/drelu-field.h | 355 ++ .../sytorch/ext/sci/src/NonLinear/maxpool.h | 181 + .../ext/sci/src/NonLinear/relu-field.h | 213 + .../ext/sci/src/NonLinear/relu-interface.h | 31 + .../sytorch/ext/sci/src/NonLinear/relu-ring.h | 251 + .../ext/sytorch/ext/sci/src/OT/CMakeLists.txt | 4 + GPU-MPC/ext/sytorch/ext/sci/src/OT/emp-ot.h | 13 + GPU-MPC/ext/sytorch/ext/sci/src/OT/ideal.h | 159 + GPU-MPC/ext/sytorch/ext/sci/src/OT/iknp.h | 783 +++ GPU-MPC/ext/sytorch/ext/sci/src/OT/kkot.h | 239 + GPU-MPC/ext/sytorch/ext/sci/src/OT/np.h | 218 + GPU-MPC/ext/sytorch/ext/sci/src/OT/ot-utils.h | 217 + GPU-MPC/ext/sytorch/ext/sci/src/OT/ot.h | 124 + GPU-MPC/ext/sytorch/ext/sci/src/OT/ot_pack.h | 124 + .../ext/sytorch/ext/sci/src/OT/split-iknp.h | 1023 +++ .../ext/sytorch/ext/sci/src/OT/split-kkot.h | 604 ++ .../ext/sytorch/ext/sci/src/OT/split-utils.h | 190 + .../ext/sci/src/cleartext_library_fixed.cpp | 1224 ++++ .../ext/sci/src/cleartext_library_fixed.h | 180 + .../sci/src/cleartext_library_fixed_uniform.h | 3061 +++++++++ .../ext/sci/src/cleartext_library_float.cpp | 162 + .../ext/sci/src/cleartext_library_float.h | 62 + GPU-MPC/ext/sytorch/ext/sci/src/defines.h | 54 + .../ext/sytorch/ext/sci/src/defines_float.h | 37 + .../ext/sytorch/ext/sci/src/defines_uniform.h | 88 + .../ext/sci/src/functionalities_uniform.h | 1127 ++++ GPU-MPC/ext/sytorch/ext/sci/src/globals.cpp | 118 + GPU-MPC/ext/sytorch/ext/sci/src/globals.h | 140 + .../ext/sytorch/ext/sci/src/globals_float.cpp | 35 + 
.../ext/sytorch/ext/sci/src/globals_float.h | 40 + .../ext/sytorch/ext/sci/src/library_fixed.cpp | 2810 +++++++++ .../ext/sytorch/ext/sci/src/library_fixed.h | 298 + .../ext/sci/src/library_fixed_common.h | 361 ++ .../ext/sci/src/library_fixed_uniform.cpp | 2093 +++++++ .../ext/sci/src/library_fixed_uniform.h | 209 + .../ext/sytorch/ext/sci/src/library_float.cpp | 1252 ++++ .../ext/sytorch/ext/sci/src/library_float.h | 207 + .../ext/sci/src/utils/ArgMapping/ArgMapping.h | 142 + .../ext/sci/src/utils/ArgMapping/LICENSE | 202 + .../ext/sci/src/utils/ArgMapping/NOTICE | 3 + .../sytorch/ext/sci/src/utils/CMakeLists.txt | 27 + .../sytorch/ext/sci/src/utils/ThreadPool.h | 116 + .../ext/sytorch/ext/sci/src/utils/aes-ni.h | 408 ++ GPU-MPC/ext/sytorch/ext/sci/src/utils/aes.h | 147 + .../ext/sytorch/ext/sci/src/utils/aes_opt.h | 1052 ++++ GPU-MPC/ext/sytorch/ext/sci/src/utils/block.h | 393 ++ GPU-MPC/ext/sytorch/ext/sci/src/utils/ccrf.h | 58 + GPU-MPC/ext/sytorch/ext/sci/src/utils/ccrh.h | 65 + .../ext/sci/src/utils/cmake/FindGMP.cmake | 21 + .../utils/cmake/source_of_randomness.cmake | 27 + .../ext/sytorch/ext/sci/src/utils/constants.h | 573 ++ GPU-MPC/ext/sytorch/ext/sci/src/utils/crh.h | 84 + .../ext/sytorch/ext/sci/src/utils/emp-tool.h | 18 + GPU-MPC/ext/sytorch/ext/sci/src/utils/f2k.h | 205 + GPU-MPC/ext/sytorch/ext/sci/src/utils/group.h | 99 + .../sytorch/ext/sci/src/utils/group_openssl.h | 216 + GPU-MPC/ext/sytorch/ext/sci/src/utils/hash.h | 115 + .../sytorch/ext/sci/src/utils/io_channel.h | 84 + .../ext/sytorch/ext/sci/src/utils/io_pack.h | 44 + .../ext/sci/src/utils/net_io_channel.h | 205 + GPU-MPC/ext/sytorch/ext/sci/src/utils/prg.h | 300 + GPU-MPC/ext/sytorch/ext/sci/src/utils/prp.h | 113 + .../ext/sytorch/ext/sci/src/utils/sse2neon.h | 5566 +++++++++++++++++ GPU-MPC/ext/sytorch/ext/sci/src/utils/tccrh.h | 73 + .../sci/src/utils/ubuntu_terminal_colors.h | 39 + GPU-MPC/ext/sytorch/ext/sci/src/utils/utils.h | 89 + .../ext/sytorch/ext/sci/src/utils/utils.hpp | 445 ++ 
.../ext/sytorch/ext/sci/tests/CMakeLists.txt | 45 + .../ext/sytorch/ext/sci/tests/FindMPFR.cmake | 72 + .../sytorch/ext/sci/tests/GC/CMakeLists.txt | 9 + .../ext/sytorch/ext/sci/tests/GC/test_and.cpp | 49 + .../ext/sytorch/ext/sci/tests/GC/test_bit.cpp | 89 + .../ext/sytorch/ext/sci/tests/GC/test_int.cpp | 67 + .../sytorch/ext/sci/tests/GC/test_msnzb.cpp | 126 + .../ext/sytorch/ext/sci/tests/float_utils.h | 13 + .../ext/sci/tests/test_field_argmax.cpp | 186 + .../sytorch/ext/sci/tests/test_field_conv.cpp | 141 + .../sci/tests/test_field_elemwise_prod.cpp | 80 + .../sytorch/ext/sci/tests/test_field_fc.cpp | 91 + .../ext/sci/tests/test_field_maxpool.cpp | 194 + .../sytorch/ext/sci/tests/test_field_relu.cpp | 271 + .../ext/sci/tests/test_float_bench_op.cpp | 236 + .../sytorch/ext/sci/tests/test_float_math.cpp | 308 + .../ext/sci/tests/test_float_primitive.cpp | 349 ++ .../ext/sci/tests/test_ring_argmax.cpp | 164 + .../ext/sci/tests/test_ring_aux_protocols.cpp | 484 ++ .../sytorch/ext/sci/tests/test_ring_exp.cpp | 192 + .../sci/tests/test_ring_hadamard_product.cpp | 99 + .../ext/sci/tests/test_ring_matmul.cpp | 211 + .../ext/sci/tests/test_ring_maxpool.cpp | 200 + .../sytorch/ext/sci/tests/test_ring_relu.cpp | 180 + .../ext/sci/tests/test_ring_sigmoid.cpp | 184 + .../sytorch/ext/sci/tests/test_ring_sqrt.cpp | 206 + .../sytorch/ext/sci/tests/test_ring_tanh.cpp | 184 + .../ext/sci/tests/test_ring_truncation.cpp | 189 + .../sci/tests/test_ring_value_extension.cpp | 145 + GPU-MPC/ext/sytorch/ezpc-cli-2.sh | 436 ++ GPU-MPC/ext/sytorch/ezpc-cli.sh | 298 + .../sytorch/include/sytorch/backend/backend.h | 194 + .../sytorch/backend/baseline_cleartext.h | 115 + .../include/sytorch/backend/cleartext.h | 130 + .../sytorch/backend/crypten_cleartext.h | 142 + .../sytorch/include/sytorch/backend/default.h | 35 + .../sytorch/include/sytorch/backend/float.h | 70 + .../include/sytorch/backend/llama_base.h | 578 ++ .../include/sytorch/backend/llama_extended.h | 89 + 
.../sytorch/backend/llama_transformer.h | 209 + .../sytorch/backend/piranha_cleartext.h | 135 + .../sytorch/backend/secureml_cleartext.h | 113 + GPU-MPC/ext/sytorch/include/sytorch/graph.h | 121 + .../sytorch/include/sytorch/layers/layers.h | 1758 ++++++ GPU-MPC/ext/sytorch/include/sytorch/module.h | 740 +++ GPU-MPC/ext/sytorch/include/sytorch/random.h | 30 + GPU-MPC/ext/sytorch/include/sytorch/softmax.h | 174 + GPU-MPC/ext/sytorch/include/sytorch/tensor.h | 801 +++ GPU-MPC/ext/sytorch/include/sytorch/utils.h | 442 ++ GPU-MPC/ext/sytorch/scores/.gitignore | 1 + GPU-MPC/ext/sytorch/scripts/dealer.py | 119 + GPU-MPC/ext/sytorch/scripts/diff.py | 33 + GPU-MPC/ext/sytorch/scripts/download_keys.py | 62 + GPU-MPC/ext/sytorch/scripts/gptacc.py | 46 + .../ext/sytorch/scripts/mnli_matched_acc.py | 38 + .../sytorch/scripts/mnnli_mismatched_acc.py | 38 + GPU-MPC/ext/sytorch/scripts/mrpcacc.py | 38 + GPU-MPC/ext/sytorch/scripts/qnli_acc.py | 38 + GPU-MPC/ext/sytorch/scripts/server.py | 33 + GPU-MPC/ext/sytorch/scripts/sst2acc.py | 37 + .../sytorch/backend/baseline_cleartext.cpp | 575 ++ .../sytorch/src/sytorch/backend/cleartext.cpp | 1108 ++++ .../ext/sytorch/src/sytorch/backend/float.cpp | 411 ++ GPU-MPC/ext/sytorch/src/sytorch/random.cpp | 31 + GPU-MPC/ext/sytorch/src/sytorch/softmax.cpp | 107 + GPU-MPC/ext/sytorch/tests/bf16.cpp | 67 + GPU-MPC/ext/sytorch/tests/clip.cpp | 88 + GPU-MPC/ext/sytorch/tests/dcf.cpp | 74 + GPU-MPC/ext/sytorch/tests/dcf_dpf_et.cpp | 58 + GPU-MPC/ext/sytorch/tests/dpf.cpp | 124 + GPU-MPC/ext/sytorch/tests/dpfet.cpp | 57 + GPU-MPC/ext/sytorch/tests/eigenbenchmark.cpp | 43 + .../ext/sytorch/tests/evalallbenchmark.cpp | 65 + GPU-MPC/ext/sytorch/tests/gelu_ulp.cpp | 68 + GPU-MPC/ext/sytorch/tests/lutss.cpp | 57 + .../ext/sytorch/tests/multi_party/bf16.cpp | 90 + .../ext/sytorch/tests/multi_party/clip.cpp | 87 + GPU-MPC/ext/sytorch/tests/multi_party/exp.cpp | 83 + .../ext/sytorch/tests/multi_party/gelu.cpp | 98 + 
.../ext/sytorch/tests/multi_party/gemm.cpp | 82 + .../sytorch/tests/multi_party/layernorm.cpp | 100 + GPU-MPC/ext/sytorch/tests/multi_party/lut.cpp | 89 + .../ext/sytorch/tests/multi_party/prtrunc.cpp | 91 + .../ext/sytorch/tests/multi_party/rsqrt.cpp | 84 + .../sytorch/tests/multi_party/sloth_ars.cpp | 85 + .../tests/multi_party/sloth_ars_faithful.cpp | 85 + .../sytorch/tests/multi_party/sloth_clip.cpp | 87 + .../sytorch/tests/multi_party/sloth_drelu.cpp | 87 + .../sytorch/tests/multi_party/sloth_lrs.cpp | 82 + .../tests/multi_party/sloth_maxpool.cpp | 81 + .../tests/multi_party/sloth_maxpool_tri.cpp | 84 + .../sytorch/tests/multi_party/sloth_relu.cpp | 87 + .../ext/sytorch/tests/multi_party/softmax.cpp | 91 + .../ext/sytorch/tests/multi_party/tanh.cpp | 82 + .../tests/multi_party/truncatereduce.cpp | 81 + GPU-MPC/ext/sytorch/tests/pubcmp.cpp | 58 + GPU-MPC/ext/sytorch/tests/sloth_drelu.cpp | 55 + GPU-MPC/ext/sytorch/tests/truncatereduce.cpp | 56 + GPU-MPC/ext/sytorch/tests/wrap.cpp | 92 + GPU-MPC/fss/dcf/gpu_dcf.cu | 378 ++ GPU-MPC/fss/dcf/gpu_dcf.h | 102 + GPU-MPC/fss/dcf/gpu_dcf_templates.h | 109 + GPU-MPC/fss/dcf/gpu_maxpool.cu | 119 + GPU-MPC/fss/dcf/gpu_maxpool.h | 54 + GPU-MPC/fss/dcf/gpu_relu.cu | 168 + GPU-MPC/fss/dcf/gpu_relu.h | 102 + GPU-MPC/fss/dcf/gpu_sgd.cu | 330 + GPU-MPC/fss/dcf/gpu_sgd.h | 57 + GPU-MPC/fss/dcf/gpu_sstab.h | 67 + GPU-MPC/fss/dcf/gpu_truncate.cu | 270 + GPU-MPC/fss/dcf/gpu_truncate.h | 132 + GPU-MPC/fss/gpu_add.h | 45 + GPU-MPC/fss/gpu_aes_shm.cu | 241 + GPU-MPC/fss/gpu_aes_shm.h | 52 + GPU-MPC/fss/gpu_aes_table.h | 1317 ++++ GPU-MPC/fss/gpu_and.cu | 56 + GPU-MPC/fss/gpu_and.h | 53 + GPU-MPC/fss/gpu_avgpool.cu | 92 + GPU-MPC/fss/gpu_avgpool.h | 53 + GPU-MPC/fss/gpu_conv2d.cu | 457 ++ GPU-MPC/fss/gpu_conv2d.h | 50 + GPU-MPC/fss/gpu_dpf.cu | 480 ++ GPU-MPC/fss/gpu_dpf.h | 107 + GPU-MPC/fss/gpu_dpf_templates.h | 135 + GPU-MPC/fss/gpu_fss_helper.h | 135 + GPU-MPC/fss/gpu_gelu.cu | 162 + GPU-MPC/fss/gpu_gelu.h | 78 + 
GPU-MPC/fss/gpu_inverse.cu | 49 + GPU-MPC/fss/gpu_inverse.h | 43 + GPU-MPC/fss/gpu_layernorm.cu | 229 + GPU-MPC/fss/gpu_layernorm.h | 92 + GPU-MPC/fss/gpu_linear_helper.cu | 175 + GPU-MPC/fss/gpu_linear_helper.h | 30 + GPU-MPC/fss/gpu_local_truncate.h | 62 + GPU-MPC/fss/gpu_lut.cu | 223 + GPU-MPC/fss/gpu_lut.h | 147 + GPU-MPC/fss/gpu_matmul.cu | 399 ++ GPU-MPC/fss/gpu_matmul.h | 98 + GPU-MPC/fss/gpu_maxpool.cu | 698 +++ GPU-MPC/fss/gpu_maxpool.h | 77 + GPU-MPC/fss/gpu_mha.cu | 207 + GPU-MPC/fss/gpu_mha.h | 190 + GPU-MPC/fss/gpu_mul.cu | 82 + GPU-MPC/fss/gpu_mul.h | 55 + GPU-MPC/fss/gpu_nexp.cu | 81 + GPU-MPC/fss/gpu_nexp.h | 57 + GPU-MPC/fss/gpu_relu.cu | 115 + GPU-MPC/fss/gpu_relu.h | 72 + GPU-MPC/fss/gpu_scalarmul.h | 52 + GPU-MPC/fss/gpu_select.cu | 135 + GPU-MPC/fss/gpu_select.h | 62 + GPU-MPC/fss/gpu_softmax.cu | 141 + GPU-MPC/fss/gpu_softmax.h | 59 + GPU-MPC/fss/gpu_sstab.h | 145 + GPU-MPC/fss/gpu_truncate.cu | 295 + GPU-MPC/fss/gpu_truncate.h | 155 + GPU-MPC/fss/gpu_window.cu | 159 + GPU-MPC/fss/gpu_window.h | 37 + GPU-MPC/nn/orca/avg_pool_layer.cu | 104 + GPU-MPC/nn/orca/avg_pool_layer.h | 55 + GPU-MPC/nn/orca/conv2d_layer.cu | 344 + GPU-MPC/nn/orca/conv2d_layer.h | 72 + GPU-MPC/nn/orca/fc_layer.cu | 340 + GPU-MPC/nn/orca/fc_layer.h | 79 + GPU-MPC/nn/orca/gpu_layer.h | 65 + GPU-MPC/nn/orca/gpu_model.h | 79 + GPU-MPC/nn/orca/maxpool_layer.cu | 181 + GPU-MPC/nn/orca/maxpool_layer.h | 60 + GPU-MPC/nn/orca/relu_extend_layer.cu | 117 + GPU-MPC/nn/orca/relu_extend_layer.h | 57 + GPU-MPC/nn/orca/relu_layer.cu | 117 + GPU-MPC/nn/orca/relu_layer.h | 58 + GPU-MPC/nn/orca_opt.h | 217 + GPU-MPC/setup.sh | 76 + GPU-MPC/tests/fss/dcf/aes.cu | 80 + GPU-MPC/tests/fss/dcf/dcf.cu | 116 + GPU-MPC/tests/fss/dcf/maxpool.cu | 150 + GPU-MPC/tests/fss/dcf/relu.cu | 98 + GPU-MPC/tests/fss/dcf/relu_extend.cu | 97 + GPU-MPC/tests/fss/dcf/stochastic_truncate.cu | 71 + GPU-MPC/tests/fss/dpf.cu | 107 + GPU-MPC/tests/fss/dpf_drelu.cu | 93 + GPU-MPC/tests/fss/dpf_eval_all.cu | 198 + 
GPU-MPC/tests/fss/dpf_lut.cu | 94 + GPU-MPC/tests/fss/gelu.cu | 104 + GPU-MPC/tests/fss/layernorm.cu | 136 + GPU-MPC/tests/fss/mha.cu | 225 + GPU-MPC/tests/fss/piranha_softmax.cu | 75 + GPU-MPC/tests/fss/relu.cu | 101 + GPU-MPC/tests/fss/rmsnorm.cu | 140 + GPU-MPC/tests/fss/secfloat_softmax.cu | 79 + GPU-MPC/tests/fss/silu.cu | 105 + GPU-MPC/tests/fss/softmax.cu | 133 + GPU-MPC/tests/fss/truncate.cu | 95 + GPU-MPC/tests/nn/orca/conv2d_test.cu | 142 + GPU-MPC/tests/nn/orca/fc_test.cu | 132 + GPU-MPC/tests/nn/orca/maxpool_test.cu | 177 + GPU-MPC/tests/nn/orca/relu_extend_test.cu | 96 + GPU-MPC/tests/nn/orca/relu_test.cu | 112 + GPU-MPC/utils/cpu_comms.h | 243 + GPU-MPC/utils/curand_utils.h | 68 + GPU-MPC/utils/exception.h | 151 + GPU-MPC/utils/gpu_comms.h | 326 + GPU-MPC/utils/gpu_data_types.h | 60 + GPU-MPC/utils/gpu_file_utils.cpp | 172 + GPU-MPC/utils/gpu_file_utils.h | 41 + GPU-MPC/utils/gpu_mem.cu | 133 + GPU-MPC/utils/gpu_mem.h | 38 + GPU-MPC/utils/gpu_random.cu | 212 + GPU-MPC/utils/gpu_random.h | 47 + GPU-MPC/utils/gpu_stats.h | 98 + GPU-MPC/utils/helper_cuda.h | 970 +++ GPU-MPC/utils/helper_cutlass.h | 41 + GPU-MPC/utils/helper_functions.h | 59 + GPU-MPC/utils/helper_string.h | 428 ++ GPU-MPC/utils/misc_utils.h | 236 + GPU-MPC/utils/sigma_comms.cpp | 182 + GPU-MPC/utils/sigma_comms.h | 97 + 492 files changed, 115841 insertions(+) create mode 100644 GPU-MPC/Dockerfile_Gen create mode 100644 GPU-MPC/Makefile create mode 100644 GPU-MPC/README.md create mode 100644 GPU-MPC/backend/orca.h create mode 100644 GPU-MPC/backend/orca_base.h create mode 100644 GPU-MPC/backend/piranha.h create mode 100644 GPU-MPC/backend/sigma.h create mode 100644 GPU-MPC/experiments/__init__.py create mode 100644 GPU-MPC/experiments/orca/__init__.py create mode 100644 GPU-MPC/experiments/orca/cnn.h create mode 100644 GPU-MPC/experiments/orca/config.json create mode 100755 GPU-MPC/experiments/orca/datasets/cifar-10/download-cifar10.sh create mode 100644 
GPU-MPC/experiments/orca/datasets/cifar10.h create mode 100644 GPU-MPC/experiments/orca/datasets/gpu_data.cu create mode 100644 GPU-MPC/experiments/orca/datasets/gpu_data.h create mode 100644 GPU-MPC/experiments/orca/datasets/mnist.cpp create mode 100644 GPU-MPC/experiments/orca/datasets/mnist.h create mode 100644 GPU-MPC/experiments/orca/model_accuracy.cu create mode 100644 GPU-MPC/experiments/orca/model_accuracy.h create mode 100644 GPU-MPC/experiments/orca/orca_dealer.cu create mode 100644 GPU-MPC/experiments/orca/orca_evaluator.cu create mode 100644 GPU-MPC/experiments/orca/orca_inference.cu create mode 100644 GPU-MPC/experiments/orca/piranha.cu create mode 100644 GPU-MPC/experiments/orca/run_experiment.py create mode 100644 GPU-MPC/experiments/orca/share_data.cpp create mode 100644 GPU-MPC/experiments/sigma/bert.h create mode 100644 GPU-MPC/experiments/sigma/gpt2.h create mode 100644 GPU-MPC/experiments/sigma/llama2.h create mode 100644 GPU-MPC/experiments/sigma/sigma.cu create mode 100644 GPU-MPC/experiments/utils.py create mode 100644 GPU-MPC/ext/sytorch/.gitignore create mode 100755 GPU-MPC/ext/sytorch/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/README.md create mode 100644 GPU-MPC/ext/sytorch/Toy example- multiple inference.md create mode 100644 GPU-MPC/ext/sytorch/Toy example- single inference.md create mode 100644 GPU-MPC/ext/sytorch/examples/bert.cpp create mode 100644 GPU-MPC/ext/sytorch/examples/bertbenchmark.cpp create mode 100644 GPU-MPC/ext/sytorch/examples/gpt-neo_nexttoken.cpp create mode 100644 GPU-MPC/ext/sytorch/examples/gpt2.cpp create mode 100644 GPU-MPC/ext/sytorch/examples/gpt2benchmark.cpp create mode 100644 GPU-MPC/ext/sytorch/examples/gpt2correctness.cpp create mode 100644 GPU-MPC/ext/sytorch/examples/gpt2dummy.cpp create mode 100644 GPU-MPC/ext/sytorch/examples/gptneo.cpp create mode 100644 GPU-MPC/ext/sytorch/examples/gptneobenchmark.cpp create mode 100644 GPU-MPC/ext/sytorch/examples/llama7b.cpp create mode 100644 
GPU-MPC/ext/sytorch/examples/resnet18.cpp create mode 100644 GPU-MPC/ext/sytorch/examples/resnet50.cpp create mode 100644 GPU-MPC/ext/sytorch/examples/vgg16.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/bitpack/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/bitpack/include/bitpack/bitpack.h create mode 100644 GPU-MPC/ext/sytorch/ext/bitpack/src/bitpack/bitpack.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/bitpack/tests/test.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/LICENSE create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/Common/Defines.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/Common/Defines.h create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/Common/Log.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/Common/Log.h create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/Common/config.h create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/Crypto/AES.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/Crypto/AES.h create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/Crypto/PRNG.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/Crypto/PRNG.h create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/gsl/GSL.natvis create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/gsl/gls-lite.hpp create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/gsl/gsl create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/gsl/gsl_algorithm create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/gsl/gsl_assert create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/gsl/gsl_byte create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/gsl/gsl_util create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/gsl/multi_span create mode 100644 
GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/gsl/span create mode 100644 GPU-MPC/ext/sytorch/ext/cryptoTools/cryptoTools/gsl/string_span create mode 100644 GPU-MPC/ext/sytorch/ext/llama/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/llama/and.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/and.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/api.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/clip.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/clip.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/conv.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/dcf.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/dpf.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/fixtobfloat16.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/fixtobfloat16.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/float.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/float.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/api.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/array.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/assert.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/comms.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/config.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/conv.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/dcf.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/dpf.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/freekey.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/group_element.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/input_prng.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/keypack.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/prng.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/stats.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/include/llama/utils.h create mode 
100644 GPU-MPC/ext/sytorch/ext/llama/lut.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/lut.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/mic.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/mic.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/msnzb.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/msnzb.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/mult.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/mult.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/pubcmp.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/pubcmp.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/pubdiv.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/pubdiv.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/relu.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/relu.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/select.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/select.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/signextend.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/signextend.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/src/llama/comms.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/src/llama/config.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/src/llama/input_prng.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/src/llama/prng.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/src/llama/stats.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/src/llama/utils.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/taylor.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/taylor.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/truncate.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/truncate.h create mode 100644 GPU-MPC/ext/sytorch/ext/llama/wrap.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/llama/wrap.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/README.md create mode 100644 GPU-MPC/ext/sytorch/ext/sci/cmake/.gitignore create 
mode 100644 GPU-MPC/ext/sytorch/ext/sci/cmake/SCIConfig.cmake.in create mode 100644 GPU-MPC/ext/sytorch/ext/sci/cmake/SecureFixedPointConfig.cmake create mode 100644 GPU-MPC/ext/sytorch/ext/sci/cmake/install_EMP.cmake create mode 100644 GPU-MPC/ext/sytorch/ext/sci/cmake/install_Eigen3.cmake create mode 100644 GPU-MPC/ext/sytorch/ext/sci/cmake/seal.patch create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/BuildingBlocks/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/BuildingBlocks/aux-protocols.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/BuildingBlocks/aux-protocols.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/BuildingBlocks/truncation.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/BuildingBlocks/truncation.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/BuildingBlocks/value-extension.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/BuildingBlocks/value-extension.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/FloatingPoint/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/FloatingPoint/bool-data.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/FloatingPoint/bool-data.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/FloatingPoint/fixed-point.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/FloatingPoint/fixed-point.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/FloatingPoint/floating-point.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/FloatingPoint/floating-point.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/FloatingPoint/fp-math-coeffs.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/FloatingPoint/fp-math.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/FloatingPoint/fp-math.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/aes_opt.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/bit.h create mode 
100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/bit.hpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/circuit_execution.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/comparable.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/emp-sh2pc.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/emp-tool.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/emp-tool.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/f2k.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/halfgate_eva.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/halfgate_eva.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/halfgate_gen.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/halfgate_gen.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/integer.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/integer.hpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/mitccrh.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/number.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/protocol_execution.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/semihonest.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/sh_eva.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/sh_gen.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/sh_party.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/swappable.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/GC/utils.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearHE/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearHE/conv-field.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearHE/conv-field.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearHE/defines-HE.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearHE/elemwise-prod-field.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearHE/elemwise-prod-field.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearHE/fc-field.cpp create mode 
100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearHE/fc-field.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearHE/generate_primes.py create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearHE/utils-HE.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearHE/utils-HE.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearOT/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearOT/linear-ot.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearOT/linear-ot.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/LinearOT/linear-uniform.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/Math/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/Math/math-functions.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/Math/math-functions.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/Millionaire/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/Millionaire/bit-triple-generator.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/Millionaire/equality.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/Millionaire/millionaire.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/Millionaire/millionaire_with_equality.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/NonLinear/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/NonLinear/argmax.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/NonLinear/drelu-field.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/NonLinear/maxpool.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/NonLinear/relu-field.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/NonLinear/relu-interface.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/NonLinear/relu-ring.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/OT/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/OT/emp-ot.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/OT/ideal.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/OT/iknp.h create mode 100644 
GPU-MPC/ext/sytorch/ext/sci/src/OT/kkot.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/OT/np.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/OT/ot-utils.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/OT/ot.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/OT/ot_pack.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/OT/split-iknp.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/OT/split-kkot.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/OT/split-utils.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/cleartext_library_fixed.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/cleartext_library_fixed.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/cleartext_library_fixed_uniform.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/cleartext_library_float.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/cleartext_library_float.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/defines.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/defines_float.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/defines_uniform.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/functionalities_uniform.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/globals.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/globals.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/globals_float.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/globals_float.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/library_fixed.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/library_fixed.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/library_fixed_common.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/library_fixed_uniform.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/library_fixed_uniform.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/library_float.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/library_float.h create mode 100644 
GPU-MPC/ext/sytorch/ext/sci/src/utils/ArgMapping/ArgMapping.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/ArgMapping/LICENSE create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/ArgMapping/NOTICE create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/ThreadPool.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/aes-ni.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/aes.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/aes_opt.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/block.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/ccrf.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/ccrh.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/cmake/FindGMP.cmake create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/cmake/source_of_randomness.cmake create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/constants.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/crh.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/emp-tool.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/f2k.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/group.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/group_openssl.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/hash.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/io_channel.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/io_pack.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/net_io_channel.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/prg.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/prp.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/sse2neon.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/tccrh.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/ubuntu_terminal_colors.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/utils.h create 
mode 100644 GPU-MPC/ext/sytorch/ext/sci/src/utils/utils.hpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/FindMPFR.cmake create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/GC/CMakeLists.txt create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/GC/test_and.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/GC/test_bit.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/GC/test_int.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/GC/test_msnzb.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/float_utils.h create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_field_argmax.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_field_conv.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_field_elemwise_prod.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_field_fc.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_field_maxpool.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_field_relu.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_float_bench_op.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_float_math.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_float_primitive.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_ring_argmax.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_ring_aux_protocols.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_ring_exp.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_ring_hadamard_product.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_ring_matmul.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_ring_maxpool.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_ring_relu.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_ring_sigmoid.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_ring_sqrt.cpp create mode 100644 
GPU-MPC/ext/sytorch/ext/sci/tests/test_ring_tanh.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_ring_truncation.cpp create mode 100644 GPU-MPC/ext/sytorch/ext/sci/tests/test_ring_value_extension.cpp create mode 100755 GPU-MPC/ext/sytorch/ezpc-cli-2.sh create mode 100755 GPU-MPC/ext/sytorch/ezpc-cli.sh create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/backend/backend.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/backend/baseline_cleartext.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/backend/cleartext.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/backend/crypten_cleartext.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/backend/default.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/backend/float.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/backend/llama_base.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/backend/llama_extended.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/backend/llama_transformer.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/backend/piranha_cleartext.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/backend/secureml_cleartext.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/graph.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/layers/layers.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/module.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/random.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/softmax.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/tensor.h create mode 100644 GPU-MPC/ext/sytorch/include/sytorch/utils.h create mode 100644 GPU-MPC/ext/sytorch/scores/.gitignore create mode 100644 GPU-MPC/ext/sytorch/scripts/dealer.py create mode 100644 GPU-MPC/ext/sytorch/scripts/diff.py create mode 100644 GPU-MPC/ext/sytorch/scripts/download_keys.py create mode 100644 GPU-MPC/ext/sytorch/scripts/gptacc.py create mode 100644 GPU-MPC/ext/sytorch/scripts/mnli_matched_acc.py 
create mode 100644 GPU-MPC/ext/sytorch/scripts/mnnli_mismatched_acc.py create mode 100644 GPU-MPC/ext/sytorch/scripts/mrpcacc.py create mode 100644 GPU-MPC/ext/sytorch/scripts/qnli_acc.py create mode 100644 GPU-MPC/ext/sytorch/scripts/server.py create mode 100644 GPU-MPC/ext/sytorch/scripts/sst2acc.py create mode 100644 GPU-MPC/ext/sytorch/src/sytorch/backend/baseline_cleartext.cpp create mode 100644 GPU-MPC/ext/sytorch/src/sytorch/backend/cleartext.cpp create mode 100644 GPU-MPC/ext/sytorch/src/sytorch/backend/float.cpp create mode 100644 GPU-MPC/ext/sytorch/src/sytorch/random.cpp create mode 100644 GPU-MPC/ext/sytorch/src/sytorch/softmax.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/bf16.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/clip.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/dcf.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/dcf_dpf_et.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/dpf.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/dpfet.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/eigenbenchmark.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/evalallbenchmark.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/gelu_ulp.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/lutss.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/bf16.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/clip.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/exp.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/gelu.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/gemm.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/layernorm.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/lut.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/prtrunc.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/rsqrt.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/sloth_ars.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/sloth_ars_faithful.cpp create mode 100644 
GPU-MPC/ext/sytorch/tests/multi_party/sloth_clip.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/sloth_drelu.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/sloth_lrs.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/sloth_maxpool.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/sloth_maxpool_tri.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/sloth_relu.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/softmax.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/tanh.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/multi_party/truncatereduce.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/pubcmp.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/sloth_drelu.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/truncatereduce.cpp create mode 100644 GPU-MPC/ext/sytorch/tests/wrap.cpp create mode 100644 GPU-MPC/fss/dcf/gpu_dcf.cu create mode 100644 GPU-MPC/fss/dcf/gpu_dcf.h create mode 100644 GPU-MPC/fss/dcf/gpu_dcf_templates.h create mode 100644 GPU-MPC/fss/dcf/gpu_maxpool.cu create mode 100644 GPU-MPC/fss/dcf/gpu_maxpool.h create mode 100644 GPU-MPC/fss/dcf/gpu_relu.cu create mode 100644 GPU-MPC/fss/dcf/gpu_relu.h create mode 100644 GPU-MPC/fss/dcf/gpu_sgd.cu create mode 100644 GPU-MPC/fss/dcf/gpu_sgd.h create mode 100644 GPU-MPC/fss/dcf/gpu_sstab.h create mode 100644 GPU-MPC/fss/dcf/gpu_truncate.cu create mode 100644 GPU-MPC/fss/dcf/gpu_truncate.h create mode 100644 GPU-MPC/fss/gpu_add.h create mode 100644 GPU-MPC/fss/gpu_aes_shm.cu create mode 100644 GPU-MPC/fss/gpu_aes_shm.h create mode 100644 GPU-MPC/fss/gpu_aes_table.h create mode 100644 GPU-MPC/fss/gpu_and.cu create mode 100644 GPU-MPC/fss/gpu_and.h create mode 100644 GPU-MPC/fss/gpu_avgpool.cu create mode 100644 GPU-MPC/fss/gpu_avgpool.h create mode 100644 GPU-MPC/fss/gpu_conv2d.cu create mode 100644 GPU-MPC/fss/gpu_conv2d.h create mode 100644 GPU-MPC/fss/gpu_dpf.cu create mode 100644 GPU-MPC/fss/gpu_dpf.h create mode 100644 
GPU-MPC/fss/gpu_dpf_templates.h create mode 100644 GPU-MPC/fss/gpu_fss_helper.h create mode 100644 GPU-MPC/fss/gpu_gelu.cu create mode 100644 GPU-MPC/fss/gpu_gelu.h create mode 100644 GPU-MPC/fss/gpu_inverse.cu create mode 100644 GPU-MPC/fss/gpu_inverse.h create mode 100644 GPU-MPC/fss/gpu_layernorm.cu create mode 100644 GPU-MPC/fss/gpu_layernorm.h create mode 100644 GPU-MPC/fss/gpu_linear_helper.cu create mode 100644 GPU-MPC/fss/gpu_linear_helper.h create mode 100644 GPU-MPC/fss/gpu_local_truncate.h create mode 100644 GPU-MPC/fss/gpu_lut.cu create mode 100644 GPU-MPC/fss/gpu_lut.h create mode 100644 GPU-MPC/fss/gpu_matmul.cu create mode 100644 GPU-MPC/fss/gpu_matmul.h create mode 100644 GPU-MPC/fss/gpu_maxpool.cu create mode 100644 GPU-MPC/fss/gpu_maxpool.h create mode 100644 GPU-MPC/fss/gpu_mha.cu create mode 100644 GPU-MPC/fss/gpu_mha.h create mode 100644 GPU-MPC/fss/gpu_mul.cu create mode 100644 GPU-MPC/fss/gpu_mul.h create mode 100644 GPU-MPC/fss/gpu_nexp.cu create mode 100644 GPU-MPC/fss/gpu_nexp.h create mode 100644 GPU-MPC/fss/gpu_relu.cu create mode 100644 GPU-MPC/fss/gpu_relu.h create mode 100644 GPU-MPC/fss/gpu_scalarmul.h create mode 100644 GPU-MPC/fss/gpu_select.cu create mode 100644 GPU-MPC/fss/gpu_select.h create mode 100644 GPU-MPC/fss/gpu_softmax.cu create mode 100644 GPU-MPC/fss/gpu_softmax.h create mode 100644 GPU-MPC/fss/gpu_sstab.h create mode 100644 GPU-MPC/fss/gpu_truncate.cu create mode 100644 GPU-MPC/fss/gpu_truncate.h create mode 100644 GPU-MPC/fss/gpu_window.cu create mode 100644 GPU-MPC/fss/gpu_window.h create mode 100644 GPU-MPC/nn/orca/avg_pool_layer.cu create mode 100644 GPU-MPC/nn/orca/avg_pool_layer.h create mode 100644 GPU-MPC/nn/orca/conv2d_layer.cu create mode 100644 GPU-MPC/nn/orca/conv2d_layer.h create mode 100644 GPU-MPC/nn/orca/fc_layer.cu create mode 100644 GPU-MPC/nn/orca/fc_layer.h create mode 100644 GPU-MPC/nn/orca/gpu_layer.h create mode 100644 GPU-MPC/nn/orca/gpu_model.h create mode 100644 
GPU-MPC/nn/orca/maxpool_layer.cu create mode 100644 GPU-MPC/nn/orca/maxpool_layer.h create mode 100644 GPU-MPC/nn/orca/relu_extend_layer.cu create mode 100644 GPU-MPC/nn/orca/relu_extend_layer.h create mode 100644 GPU-MPC/nn/orca/relu_layer.cu create mode 100644 GPU-MPC/nn/orca/relu_layer.h create mode 100644 GPU-MPC/nn/orca_opt.h create mode 100644 GPU-MPC/setup.sh create mode 100644 GPU-MPC/tests/fss/dcf/aes.cu create mode 100644 GPU-MPC/tests/fss/dcf/dcf.cu create mode 100644 GPU-MPC/tests/fss/dcf/maxpool.cu create mode 100644 GPU-MPC/tests/fss/dcf/relu.cu create mode 100644 GPU-MPC/tests/fss/dcf/relu_extend.cu create mode 100644 GPU-MPC/tests/fss/dcf/stochastic_truncate.cu create mode 100644 GPU-MPC/tests/fss/dpf.cu create mode 100644 GPU-MPC/tests/fss/dpf_drelu.cu create mode 100644 GPU-MPC/tests/fss/dpf_eval_all.cu create mode 100644 GPU-MPC/tests/fss/dpf_lut.cu create mode 100644 GPU-MPC/tests/fss/gelu.cu create mode 100644 GPU-MPC/tests/fss/layernorm.cu create mode 100644 GPU-MPC/tests/fss/mha.cu create mode 100644 GPU-MPC/tests/fss/piranha_softmax.cu create mode 100644 GPU-MPC/tests/fss/relu.cu create mode 100644 GPU-MPC/tests/fss/rmsnorm.cu create mode 100644 GPU-MPC/tests/fss/secfloat_softmax.cu create mode 100644 GPU-MPC/tests/fss/silu.cu create mode 100644 GPU-MPC/tests/fss/softmax.cu create mode 100644 GPU-MPC/tests/fss/truncate.cu create mode 100644 GPU-MPC/tests/nn/orca/conv2d_test.cu create mode 100644 GPU-MPC/tests/nn/orca/fc_test.cu create mode 100644 GPU-MPC/tests/nn/orca/maxpool_test.cu create mode 100644 GPU-MPC/tests/nn/orca/relu_extend_test.cu create mode 100644 GPU-MPC/tests/nn/orca/relu_test.cu create mode 100644 GPU-MPC/utils/cpu_comms.h create mode 100644 GPU-MPC/utils/curand_utils.h create mode 100644 GPU-MPC/utils/exception.h create mode 100644 GPU-MPC/utils/gpu_comms.h create mode 100644 GPU-MPC/utils/gpu_data_types.h create mode 100644 GPU-MPC/utils/gpu_file_utils.cpp create mode 100644 GPU-MPC/utils/gpu_file_utils.h create mode 
100644 GPU-MPC/utils/gpu_mem.cu create mode 100644 GPU-MPC/utils/gpu_mem.h create mode 100644 GPU-MPC/utils/gpu_random.cu create mode 100644 GPU-MPC/utils/gpu_random.h create mode 100644 GPU-MPC/utils/gpu_stats.h create mode 100644 GPU-MPC/utils/helper_cuda.h create mode 100644 GPU-MPC/utils/helper_cutlass.h create mode 100644 GPU-MPC/utils/helper_functions.h create mode 100644 GPU-MPC/utils/helper_string.h create mode 100644 GPU-MPC/utils/misc_utils.h create mode 100644 GPU-MPC/utils/sigma_comms.cpp create mode 100644 GPU-MPC/utils/sigma_comms.h diff --git a/GPU-MPC/Dockerfile_Gen b/GPU-MPC/Dockerfile_Gen new file mode 100644 index 00000000..baca632e --- /dev/null +++ b/GPU-MPC/Dockerfile_Gen @@ -0,0 +1,34 @@ +# Author: Tanmay Rajore,Neha Jawalkar +# +# Copyright: +# Copyright (c) 2024 Microsoft Research +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 + +WORKDIR /home +RUN ln -sf /bin/bash /bin/sh + +RUN apt update && apt upgrade -y && apt install -y git apt-utils; \ + apt install -y sudo ; \ + sudo apt install -y gcc-9 g++-9; \ + sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 60 --slave /usr/bin/g++ g++ /usr/bin/g++-9;\ + sudo update-alternatives --config gcc;\ + sudo apt install libssl-dev cmake python3-pip libgmp-dev libmpfr-dev -y;\ + sudo apt install cmake make libeigen3-dev -y ; + +RUN git config --global --add safe.directory * +#RUN git submodule update --init --recursive diff --git a/GPU-MPC/Makefile b/GPU-MPC/Makefile new file mode 100644 index 00000000..022da47b --- /dev/null +++ b/GPU-MPC/Makefile @@ -0,0 +1,132 @@ +CUDA_VERSION ?= $(value CUDA_VERSION) +ifeq ($(CUDA_VERSION),) + CUDA_VERSION = 11.7 +endif +CUTLASS_PATH=./ext/cutlass +SYTORCH_PATH=./ext/sytorch +SYTORCH_BUILD_PATH=$(SYTORCH_PATH)/build +LLAMA_PATH=$(SYTORCH_PATH)/ext/llama +CUDA_ARCH =$(GPU_ARCH) + +CXX=/usr/local/cuda-$(CUDA_VERSION)/bin/nvcc +FLAGS := -O3 -gencode arch=compute_$(CUDA_ARCH),code=[sm_$(CUDA_ARCH),compute_$(CUDA_ARCH)] -std=c++17 -m64 -Xcompiler="-O3,-w,-std=c++17,-fpermissive,-fpic,-pthread,-fopenmp,-march=native" +LIBS := -lsytorch -lcryptoTools -lLLAMA -lbitpack -lcuda -lcudart -lcurand +SECFLOAT_LIBS := -lSCI-FloatML -lSCI-FloatingPoint -lSCI-BuildingBlocks -lSCI-LinearOT -lSCI-GC -lcrypto + +UTIL_FILES := ./utils/gpu_mem.cu ./utils/gpu_file_utils.cpp ./utils/sigma_comms.cpp +OBJ_INCLUDES := -I '$(CUTLASS_PATH)/include' -I '$(CUTLASS_PATH)/tools/util/include' -I '$(SYTORCH_PATH)/include' -I '$(LLAMA_PATH)/include' -I '$(SYTORCH_PATH)/ext/cryptoTools' -I '.' 
+INCLUDES := $(OBJ_INCLUDES) -L$(CUTLASS_PATH)/build/tools/library -L$(SYTORCH_BUILD_PATH) -L$(SYTORCH_BUILD_PATH)/ext/cryptoTools -L$(SYTORCH_BUILD_PATH)/ext/llama -L$(SYTORCH_BUILD_PATH)/ext/bitpack -L$(SYTORCH_BUILD_PATH)/lib + +dpf: tests/fss/dpf.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/dpf + +dpf_eval_all: tests/fss/dpf_eval_all.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/dpf_eval_all + +dpf_drelu: tests/fss/dpf_drelu.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/dpf_drelu + +dpf_lut: tests/fss/dpf_lut.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/dpf_lut + +gelu: tests/fss/gelu.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/gelu + +relu: tests/fss/relu.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/relu + +rmsnorm: tests/fss/rmsnorm.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/rmsnorm + +softmax: tests/fss/softmax.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/softmax + +fc: tests/fss/fc.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/fc + +layernorm: tests/fss/layernorm.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/layernorm + +silu: tests/fss/silu.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/silu + +truncate: tests/fss/truncate.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/truncate + +mha: tests/fss/mha.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/mha + +secfloat_softmax: tests/fss/secfloat_softmax.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) $(SECFLOAT_LIBS) -o tests/fss/secfloat_softmax + +piranha_softmax: tests/fss/piranha_softmax.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/piranha_softmax + +orca_dealer: experiments/orca/orca_dealer.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) 
$(SECFLOAT_LIBS) -o experiments/orca/orca_dealer + +orca_evaluator: experiments/orca/orca_evaluator.cu experiments/orca/datasets/mnist.cpp + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) $(SECFLOAT_LIBS) -o experiments/orca/orca_evaluator + +dcf: tests/fss/dcf/dcf.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/dcf/dcf + +aes: tests/fss/dcf/aes.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/dcf/aes + +dcf_relu_extend: tests/fss/dcf/relu_extend.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/dcf/relu_extend + +dcf_stochastic_truncate: tests/fss/dcf/stochastic_truncate.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/dcf/stochastic_truncate + +dcf_relu: tests/fss/dcf/relu.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/fss/dcf/relu + +orca_conv2d: tests/nn/orca/conv2d_test.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/nn/orca/conv2d + +orca_maxpool: tests/nn/orca/maxpool_test.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/nn/orca/maxpool + +orca_relu_extend: tests/nn/orca/relu_extend_test.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/nn/orca/relu_extend + +orca_fc: tests/nn/orca/fc_test.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/nn/orca/fc + +orca_relu: tests/nn/orca/relu_test.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o tests/nn/orca/relu + +orca_inference: experiments/orca/orca_inference.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o experiments/orca/orca_inference + +orca_inference_u32: experiments/orca/orca_inference.cu + $(CXX) $(FLAGS) -DInfType=u32 $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o experiments/orca/orca_inference_u32 + +sigma: experiments/sigma/sigma.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o experiments/sigma/sigma + +piranha: experiments/orca/piranha.cu + $(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) 
$(LIBS) -o experiments/orca/piranha + +share_data: experiments/orca/share_data.cpp experiments/orca/datasets/mnist.cpp +	$(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o experiments/orca/share_data + +model_accuracy: experiments/orca/model_accuracy.cu experiments/orca/datasets/mnist.cpp +	$(CXX) $(FLAGS) $(INCLUDES) $^ $(UTIL_FILES) $(LIBS) -o experiments/orca/model_accuracy + +orca: orca_dealer orca_evaluator orca_inference orca_inference_u32 piranha + +clean: + 	rm -rf ext/cutlass/build + 	rm -rf ext/sytorch/build + 	rm -rf orca/experiments/output + 	rm -rf sigma/experiments/output + 	rm experiments/orca/orca_dealer + 	rm experiments/orca/orca_evaluator + 	rm experiments/orca/orca_inference + 	rm experiments/orca/orca_inference_u32 + 	rm experiments/orca/piranha + 	rm experiments/sigma/sigma + diff --git a/GPU-MPC/README.md b/GPU-MPC/README.md new file mode 100644 index 00000000..a9515405 --- /dev/null +++ b/GPU-MPC/README.md @@ -0,0 +1,123 @@ + +# Orca: FSS-based Secure Training and Inference with GPUs + +Implementation of protocols from the paper [Orca](https://eprint.iacr.org/2023/206). + +**Warning**: This is an academic proof-of-concept prototype and has not received careful code review. This implementation is NOT ready for production use. + +## Build + +This project requires NVIDIA GPUs, and assumes that GPU drivers and the [NVIDIA CUDA Toolkit](https://docs.nvidia.com/cuda/) are already installed. The following has been tested on Ubuntu 20.04 with CUDA 11.7, CMake 3.27.2 and g++-9. + +Please note that Sytorch requires CMake version >= 3.17 and the build will fail if this dependency is not met. + +The code uses CUTLASS version 2.11 by default, so if you change the CUDA version, please make sure that the CUTLASS version being built is compatible with the new CUDA version. To change the version of CUTLASS being built, add `git checkout ;` after line 31 (`cd ext/cutlass;`) of setup.sh. 
+ +The last line of `setup.sh` tries to install `matplotlib`, which is needed for generating Figures 5a and 5b. In our experience, the installation fails if the versions of Python and `pip` do not match. In case the installation fails, please install `matplotlib` manually before running `run_experiment.py`. + +1. Export environment variables + +``` +export CUDA_VERSION=11.7 +export GPU_ARCH=86 +``` + +2. Set up the environment + +``` +sh setup.sh +``` + +3. Make Orca + +``` +make orca +``` + +## Run + +1. Each party runs two processes: a dealer and an evaluator. The configuration needs to define the GPU on which the dealer will run, and the directory in which it will store FSS keys. This is done in `config.json` as: + +```javascript +"dealer" : + { "gpu": , + "key_dir": + } +``` + +FSS keys tend to be quite large so please make sure that the key directory has at least 500GB of free space. Please also ensure that it is writeable. + +Similarly, the configuration also needs to define the GPU on which the evaluator will run, and the IP address of its peer, i.e., the address of the remote party the evaluator will communicate with for secure training or inference. This is done in `config.json` as: + +```javascript +"evaluator" : + { "gpu": , + "peer": + } +``` + +You can run Orca to generate Figures 5a and 5b, as well as Tables 3, 4, 6, 7, 8 and 9. Table 5 can be generated by throttling the network bandwidth (with `tc`, for example) and regenerating Table 4. The script reports numbers for Tables 4, 6, 7 and 9 as the average of 10 iterations. + +Figure 5b and Table 3 run end-to-end training and so can take a couple of days to finish. + +Evaluation runs through `experiments/orca/run_experiment.py`. Here are the relevant options: + +``` +usage: run_experiment.py [-h] [--figure FIGURE] [--table TABLE] --party 0/1 + +optional arguments: + --figure FIGURE Figure # to run. + --table TABLE Table # to run. + --all true Run all the experiments. 
+``` + +Results are stored in the `output/P/Table` or `output/P/Fig` folders. + +Log files (which might help with debugging) are stored in the corresponding experiment folders, i.e., in `output/P/Table/logs` and `output/P/Fig/logs`. + +## Docker Build + +You can also build the docker image using the provided Dockerfile_Gen for building the Environment. + +### Install Nvidia Container Toolkit +- Configure the repository: +``` +curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey |sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ +&& curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list \ +&& sudo apt-get update +``` + +- Install the NVIDIA Container Toolkit packages: +``` +sudo apt-get install -y nvidia-container-toolkit +sudo nvidia-ctk runtime configure --runtime=docker +sudo systemctl restart docker +``` +### Build the Docker Image / pull the image from Docker Hub +``` +# Local Build +docker build -t gpu_mpc -f Dockerfile_Gen . + +# Pull from Docker Hub (Cuda 11.8) +docker pull trajore/gpu_mpc +``` +### Run the Docker Container +``` +sudo docker run --gpus all --network host -v /home/$USER/path_to_GPU-MPC/:/home -it container_name /bin/bash + +``` +Then Run setup.sh to configure according to GPU_arch and make orca as mentioned above. + +## Citation + +You can cite the paper using the following BibTeX entry: + +``` +@INPROCEEDINGS {, +author = {N. Jawalkar and K. Gupta and A. Basu and N. Chandran and D. Gupta and R. 
Sharma}, +booktitle = {2024 IEEE Symposium on Security and Privacy (SP)}, +title = {Orca: FSS-based Secure Training and Inference with GPUs}, +year = {2024} +} +``` + diff --git a/GPU-MPC/backend/orca.h b/GPU-MPC/backend/orca.h new file mode 100644 index 00000000..4850da52 --- /dev/null +++ b/GPU-MPC/backend/orca.h @@ -0,0 +1,243 @@ +// Author: Neha Jawalkar +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#pragma once + +#include "utils/gpu_random.h" +#include "utils/gpu_mem.h" + +#include "orca_base.h" + +#include "fss/dcf/gpu_relu.h" +#include "fss/dcf/gpu_truncate.h" +#include "fss/dcf/gpu_maxpool.h" +// pin all the weights and activations in cpu memory + +template +class Orca : public OrcaBase +{ +public: + Orca() : OrcaBase() {} + + Orca(int party, std::string ip, int bw, int scale, std::string keyFile = "") : OrcaBase(party, ip, bw, scale, keyFile, false) + { + } + + void relu(Tensor &in, Tensor &out, const Tensor &drelu, u64 scale, int mode) + { + if (mode == 2) + { + // auto h_inp = (T*) moveToCPU((u8*) in.d_data, in.size() * sizeof(T), NULL); + // printf("Relu input=%ld, %ld, %ld\n", h_inp[0], h_inp[1], h_inp[2]); + + auto start = std::chrono::high_resolution_clock::now(); + + auto k = dcf::readGPUReluExtendKey(&(this->keyBuf)); + auto d_temp = dcf::gpuReluExtend(this->peer, this->party, k, in.d_data, &(this->g), &(this->s)); + auto d_drelu = d_temp.first; + gpuFree(d_drelu); + out.d_data = d_temp.second; + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + this->s.reluext_time += std::chrono::duration_cast(elapsed).count(); + + // printf("Num relus=%d, %lx, %lu\n", out.size(), in.d_data, out.size() * sizeof(T)); + // auto h_data = (T*) moveToCPU((u8*) out.d_data, out.size() * sizeof(T), NULL); + // printf("Relu output=%lu, %lu, %ld\n", h_data[0], h_data[1], h_data[2]); + } + else + { + auto start = std::chrono::high_resolution_clock::now(); + + auto k = dcf::readTwoRoundReluKey(&(this->keyBuf)); + auto d_temp = dcf::gpuTwoRoundRelu(this->peer, this->party, k, in.d_data, &(this->g), &(this->s)); + auto d_drelu = d_temp.first; + gpuFree(d_drelu); + out.d_data = d_temp.second; + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + this->s.relu_time += std::chrono::duration_cast(elapsed).count(); + } + } + + void truncateForward(Tensor &in, u64 shift, u8 mode = 0) + { + // 
printf("Truncate=%lu, %lu, %lu\n", mode, shift, size); + auto start = std::chrono::high_resolution_clock::now(); + if (mode == 0) + { + auto k = dcf::readGPUTrStochasticKey(&(this->keyBuf)); + dcf::gpuStochasticTruncate(k, this->party, this->peer, in.d_data, &(this->g), &(this->s)); + } + else if (mode == 1) + { + auto k = dcf::readGPUStTRKey(&(this->keyBuf)); + dcf::gpuStochasticTR(k, this->party, this->peer, in.d_data, &(this->g), &(this->s)); + } + else + { + assert(0); + } + // auto h_data = (T*) moveToCPU((u8*) in.d_data, in.size() * sizeof(T), NULL); + // printf("Truncate output=%lu, %lu, %lu\n", h_data[0], h_data[1], h_data[in.size() - 1]); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + this->s.truncate_time += std::chrono::duration_cast(elapsed).count(); + } + + void signext(Tensor &x, u64 scale) + { + // printf("Sign ext=%lu\n", x.size()); + auto start = std::chrono::high_resolution_clock::now(); + auto k = dcf::readGPUSignExtendKey(&(this->keyBuf)); + dcf::gpuSignExtend(k, this->party, this->peer, x.d_data, &(this->g), &(this->s)); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + this->s.signext_time += std::chrono::duration_cast(elapsed).count(); + } + + void maxPool2D(u64 ks, u64 padding, u64 stride, const Tensor4D &in, Tensor4D &out, Tensor4D &maxIdx, u64 scale, u8 mode) + { + auto start = std::chrono::high_resolution_clock::now(); + + assert(in.d1 == out.d1); + assert(in.d4 == out.d4); + int tmpBw = this->bw; + // Neha: ugly hack + if (mode == 3) + tmpBw -= scale; + MaxpoolParams p = { + tmpBw, tmpBw, 0, 0, this->bw, + (int)in.d1, (int)in.d2, (int)in.d3, (int)in.d4, + (int)ks, (int)ks, + (int)stride, (int)stride, + (int)padding, (int)padding, + (int)padding, (int)padding, + 0, 0, false}; + initPoolParams(p); + auto k = dcf::readGPUMaxpoolKey(p, &(this->keyBuf)); + out.d_data = dcf::gpuMaxPool(this->peer, this->party, p, k, in.d_data, (u32 *)NULL, &(this->g), 
&(this->s)); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + this->s.maxpool_time += std::chrono::duration_cast(elapsed).count(); + } +}; + +template +class OrcaKeygen : public OrcaBaseKeygen +{ +public: + OrcaKeygen(int party, int bw, int scale, std::string keyFile) : OrcaBaseKeygen(party, bw, scale, keyFile) + { + } + + void relu(Tensor &in, Tensor &out, const Tensor &drelu, u64 scale, int mode) + { + assert(in.is_same_shape(out)); + assert(in.is_same_shape(drelu)); + // printf("Keygen relu mode=%d\n", mode); + if (mode == 2) + { + // auto h_inp = (T*) moveToCPU((u8*) in.d_data, in.size() * sizeof(T), NULL); + // printf("Relu inp mask=%ld, %ld\n", h_inp[0], h_inp[1]); + // printf("Addr=%lx\n", in.d_data); + auto d_tempMask = dcf::gpuKeygenReluExtend(&(this->keyBuf), this->party, this->bw - scale, this->bw, in.size(), in.d_data, &(this->g)); + auto d_dreluMask = d_tempMask.first; + gpuFree(d_dreluMask); + auto d_reluMask = d_tempMask.second; + out.d_data = d_reluMask; + // auto h_out = (T*) moveToCPU((u8*) out.d_data, in.size() * sizeof(T), NULL); + // printf("Relu out mask=%ld, %ld\n", h_out[0], h_out[1]); + } + else + { + int tmpBw = this->bw; + if (mode == 3) + tmpBw -= scale; + auto d_tempMask = dcf::gpuGenTwoRoundReluKey(&(this->keyBuf), this->party, tmpBw, tmpBw, in.size(), in.d_data, &(this->g)); + auto d_dreluMask = d_tempMask.first; + gpuFree(d_dreluMask); + auto d_reluMask = d_tempMask.second; + out.d_data = d_reluMask; + } + // printf("Done keygen relu\n"); + } + + void truncateForward(Tensor &in, u64 shift, u8 mode = 0) + { + if (mode == 0) + { + in.d_data = dcf::genGPUStochasticTruncateKey(&(this->keyBuf), this->party, this->bw, this->bw, shift, in.size(), in.d_data, &(this->g)); + } + else if (mode == 1) + { + in.d_data = dcf::genGPUStTRKey(&(this->keyBuf), this->party, this->bw, this->bw - shift, shift, in.size(), in.d_data, &(this->g)); + } + else + { + assert(0); + } + } + + void + signext(Tensor &x, u64 
scale) + { + // printf("Signext inp mask %lx\n", x.d_data); + + int bin = this->bw - scale; + int bout = this->bw; + x.d_data = dcf::genSignExtendKey(&(this->keyBuf), this->party, bin, bout, x.size(), x.d_data, &(this->g)); + + // auto h_mask = (T*) moveToCPU((u8*) x.d_data, x.size() * sizeof(T), NULL); + // printf("Signext out mask %lx=%ld, %ld\n", x.d_data, h_mask[0], h_mask[1]); + } + + void maxPool2D(u64 ks, u64 padding, u64 stride, const Tensor4D &in, Tensor4D &out, Tensor4D &maxIdx, u64 scale, u8 mode) + { + int tmpBw = this->bw; + // Neha: ugly hack + if (mode == 3) + tmpBw -= scale; + MaxpoolParams p = { + tmpBw, tmpBw, 0, 0, this->bw, + (int)in.d1, (int)in.d2, (int)in.d3, (int)in.d4, + (int)ks, (int)ks, + (int)stride, (int)stride, + (int)padding, (int)padding, + (int)padding, (int)padding, + 0, 0, false}; + initPoolParams(p); + out.d_data = dcf::gpuKeygenMaxpool(&(this->keyBuf), this->party, p, in.d_data, (u8 *)NULL, &(this->g)); + // printf("done with keygen maxpool=%lx\n", out.d_data); + } +}; + +template +class OrcaDummy : public Orca +{ +public: + OrcaDummy() + { + } +}; \ No newline at end of file diff --git a/GPU-MPC/backend/orca_base.h b/GPU-MPC/backend/orca_base.h new file mode 100644 index 00000000..7e2b1104 --- /dev/null +++ b/GPU-MPC/backend/orca_base.h @@ -0,0 +1,311 @@ +// Author: Neha Jawalkar +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include + +#include +#include +#include +#include + +#include "nn/orca_opt.h" + +#include "utils/gpu_random.h" +#include "utils/gpu_mem.h" + +#include "fss/gpu_matmul.h" +#include "fss/gpu_conv2d.h" +#include "fss/gpu_relu.h" +#include "fss/gpu_maxpool.h" +#include "fss/gpu_avgpool.h" +#include "fss/gpu_add.h" + +template +class OrcaBase : public Backend +{ +public: + u8 *startPtr = NULL; + u8 *keyBuf = NULL; + size_t keySize = 0; + int fd = -1; + GpuPeer *peer = NULL; + int party = -1; + Stats s; + int bw; + int scale; + AESGlobalContext g; + + OrcaBase() {} + + OrcaBase(int party, std::string ip, int bw, int scale, std::string keyFile = "", bool compress = true) : party(party), bw(bw), scale(scale) + { + initAESContext(&g); + initGPUMemPool(); + // omp_set_num_threads(2); + if (keyFile.compare("") != 0) + { + auto filename = keyFile + "_inference_key" + std::to_string(party) + ".dat"; + keySize = std::filesystem::file_size(filename); + fd = openForReading(filename); + // printf("%s, %d\n", filename.data(), fd); + getAlignedBuf(&keyBuf, keySize); + startPtr = keyBuf; + } + peer = new GpuPeer(compress); + peer->connect(party, ip); + } + + void close() + { + peer->close(); + // printf("Key read=%lu\n", keyBuf - startPtr); + } + + void conv2D(u64 fh, u64 fw, u64 padding, u64 stride, u64 ci, u64 co, const Tensor4D &input, const Tensor2D &filter, bool useBias, const Tensor1D &bias, Tensor4D &output, bool isFirst) + { + auto comm_start = s.comm_time; + auto start = 
std::chrono::high_resolution_clock::now(); + GPUConv2DKey k; + k.p = { + bw, bw, (int)input.d1, (int)input.d2, (int)input.d3, (int)ci, + (int)fh, (int)fw, (int)co, (int)padding, (int)padding, (int)padding, (int)padding, + (int)stride, (int)stride, 0, 0, 0, 0, 0}; + fillConv2DParams(&(k.p)); + k.mem_size_I = k.p.size_I * sizeof(T); + k.mem_size_F = k.p.size_F * sizeof(T); + k.mem_size_O = k.p.size_O * sizeof(T); + + k.I = (T *)keyBuf; + keyBuf += k.mem_size_I; + k.F = (T *)keyBuf; + keyBuf += k.mem_size_F; + k.O = (T *)keyBuf; + keyBuf += k.mem_size_O; + + auto d_mask_I = (T *)moveToGPU((u8 *)k.I, k.mem_size_I, &s); + if (isFirst) + { + gpuLinearComb(bw, k.p.size_I, input.d_data, T(1), input.d_data, T(1), d_mask_I); + peer->reconstructInPlace(input.d_data, bw, k.p.size_I, &s); + } + // printf("Input=%lx\n", input.d_data); + auto d_F = (T *)moveToGPU((u8 *)filter.data, k.mem_size_F, &s); + // printf("filter=%lu\n", filter.data[k.p.size_F - 1]); + auto d_mask_F = (T *)moveToGPU((u8 *)k.F, k.mem_size_F, &s); + auto d_C = gpuConv2DBeaver(k, party, input.d_data, d_F, d_mask_I, d_mask_F, useBias && party == SERVER0 ? 
bias.data : (T *)NULL, &s, 0); + + gpuFree(d_F); + gpuFree(d_mask_I); + gpuFree(d_mask_F); + // printf("size O=%lu\n", k.p.size_O); + peer->reconstructInPlace(d_C, k.p.bout, k.p.size_O, &s); + output.d_data = d_C; + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + s.conv_time += std::chrono::duration_cast(elapsed).count(); + auto comm_end = s.comm_time; + s.conv_comm_time += (comm_end - comm_start); + } + + void matmul(const Tensor2D &a, const Tensor2D &b, Tensor2D &c, bool useBias, Tensor1D &d, bool isFirst) + { + // auto h_data = (T*) moveToCPU((u8*) a.d_data, a.size() * sizeof(T), NULL); + // printf("Matmul input=%ld, %ld\n", h_data[0], h_data[1]); + // for(int i = 0; i < a.size(); i++) printf("Matmul input=%ld\n", h_data[i]); + auto comm_start = s.comm_time; + auto start = std::chrono::high_resolution_clock::now(); + + MatmulParams p; + p.M = a.d1; + p.K = a.d2; + p.N = b.d2; + p.batchSz = 1; + stdInit(p, bw, 0); + auto k = readGPUMatmulKey(p, TruncateType::None, &keyBuf); + + auto d_mask_A = (T *)moveToGPU((u8 *)k.A, k.mem_size_A, &s); + if (isFirst) + { + gpuLinearComb(bw, p.size_A, a.d_data, T(1), a.d_data, T(1), d_mask_A); + peer->reconstructInPlace(a.d_data, bw, p.size_A, &s); + } + c.d_data = gpuMatmul(peer, party, p, k, a.d_data, b.data, useBias ? 
d.data : (T *)NULL, TruncateType::None, &g, &s, false, d_mask_A); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + s.matmul_time += std::chrono::duration_cast(elapsed).count(); + auto comm_end = s.comm_time; + s.matmul_comm_time += (comm_end - comm_start); + } + + void avgPool2D(u64 ks, u64 padding, u64 stride, const Tensor4D &in, Tensor4D &out, u64 scale) + { + AvgPoolParams p = { + bw, bw, (int)scale, (int)scale, 0, (int)in.d1, (int)in.d2, (int)in.d3, (int)in.d4, + (int)ks, (int)ks, (int)stride, (int)stride, (int)padding, (int)padding, (int)padding, (int)padding, 0, 0, false}; + initPoolParams(p); + out.d_data = gpuAddPool(p, in.d_data, &s); + } + + void output(Tensor &a) + { + // int tmpBw = bw - scale; + int N = a.size(); + unmaskValues(/*tmpBw*/ bw, N, a.d_data, (T *)keyBuf, &s); + gpuLocalTr(party, bw, scale, N, a.d_data, true); + moveIntoCPUMem((u8 *)a.data, (u8 *)a.d_data, N * sizeof(T), &s); + } + + void add(const std::vector *> &in, Tensor &out) + { + int tmpBw = bw - scale; + int N = in[0]->size(); + std::vector gpuInp; + for (int i = 0; i < in.size(); i++) + { + gpuInp.push_back(in[i]->d_data); + } + out.d_data = gpuAdd(tmpBw, N, gpuInp); + } + + void optimize(LayerGraphNode *root) + { + topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) + { orcaOpt(n, r); }); + topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) + { pinCpuMem(n, r); }); + } +}; + +template +class OrcaBaseKeygen : public Backend +{ +public: + u8 *startPtr; + u8 *keyBuf = NULL; + size_t keyBufSize = 0; + int party = -1; + std::string keyFile; + int scale; + int bw; + AESGlobalContext g; + + OrcaBaseKeygen(int party, int bw, int scale, std::string keyFile) : party(party), bw(bw), scale(scale), keyFile(keyFile) + { + initAESContext(&g); + initGPURandomness(); + initCPURandomness(); + initGPUMemPool(); + keyBufSize = 20 * OneGB; + getAlignedBuf(&keyBuf, keyBufSize, true); + startPtr = keyBuf; + } + + void close() + { + size_t 
keySize = keyBuf - startPtr; + size_t padding = 4096 - (keySize % 4096); + char *zeros = new char[padding]; + memset(zeros, 0, padding); + memcpy(keyBuf, zeros, padding); + keyBuf += padding; + keySize += padding; + assert(keySize < keyBufSize); + int fd = openForWriting(keyFile + "_inference_key" + std::to_string(party) + ".dat"); + writeKeyBuf(fd, keySize, startPtr); + assert(0 == fsync(fd) && "sync error!"); + closeFile(fd); + cpuFree(startPtr, true); + destroyGPURandomness(); + destroyCPURandomness(); + } + + void conv2D(u64 fh, u64 fw, u64 padding, u64 stride, u64 ci, u64 co, const Tensor4D &input, const Tensor2D &filter, Tensor4D &output, bool isFirst) + { + GPUConv2DKey k; + k.p = { + bw, bw, (int)input.d1, (int)input.d2, (int)input.d3, (int)ci, + (int)fh, (int)fw, (int)co, (int)padding, (int)padding, (int)padding, (int)padding, + (int)stride, (int)stride, 0, 0, 0, 0, 0}; + fillConv2DParams(&(k.p)); + k.mem_size_I = k.p.size_I * sizeof(T); + k.mem_size_F = k.p.size_F * sizeof(T); + k.mem_size_O = k.p.size_O * sizeof(T); + output.d_data = gpuKeygenConv2D(&keyBuf, party, k, input.d_data, filter.data, true); + } + + void matmul(const Tensor2D &a, const Tensor2D &b, Tensor2D &c) + { + MatmulParams p; + p.M = a.d1; + p.K = a.d2; + p.N = b.d2; + p.batchSz = 1; + stdInit(p, bw, 0); + // printf("####### X=%lu\n", a.size()); + // auto h_temp = (u8*) moveToCPU((u8*) a.d_data, a.size() * sizeof(T), (Stats*) NULL); + c.d_data = gpuKeygenMatmul(&keyBuf, party, p, a.d_data, b.data, (T *)NULL, TruncateType::None, &g, false); + } + + void avgPool2D(u64 ks, u64 padding, u64 stride, const Tensor4D &in, Tensor4D &out, u64 scale) + { + AvgPoolParams p = { + bw, bw, (int)scale, (int)scale, 0, (int)in.d1, (int)in.d2, (int)in.d3, (int)in.d4, + (int)ks, (int)ks, (int)stride, (int)stride, (int)padding, (int)padding, (int)padding, (int)padding, 0, 0, false}; + initPoolParams(p); + out.d_data = gpuAddPool(p, in.d_data, (Stats *)NULL); + } + + void add(const std::vector *> &in, Tensor 
&out) + { + int tmpBw = this->bw - this->scale; + int N = in[0]->size(); + std::vector gpuInp; + for (int i = 0; i < in.size(); i++) + { + gpuInp.push_back(in[i]->d_data); + } + out.d_data = gpuAdd(tmpBw, N, gpuInp); + } + + void addbias(Tensor &x, const Tensor1D &bias) + { + gpuAddBias(1, x.size() / bias.d1, bias.d1, bw, x.d_data, bias.data, NULL); + } + + void output(Tensor &a) + { + int N = a.size(); + size_t memSz = N * sizeof(T); + moveIntoCPUMem((u8 *)keyBuf, (u8 *)a.d_data, memSz, (Stats *)NULL); + keyBuf += memSz; + } + + void optimize(LayerGraphNode *root) + { + topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) + { orcaOpt(n, r); }); + topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) + { pinCpuMem(n, r); }); + } +}; diff --git a/GPU-MPC/backend/piranha.h b/GPU-MPC/backend/piranha.h new file mode 100644 index 00000000..5d2f10ec --- /dev/null +++ b/GPU-MPC/backend/piranha.h @@ -0,0 +1,160 @@ +// Author: Neha Jawalkar +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include "utils/gpu_random.h" +#include "utils/gpu_mem.h" + +#include "orca.h" +// pin all the weights and activations in cpu memory + +template +void piranhaOpt(LayerGraphNode *n, LayerGraphNode *r) +{ + if (!n->layer->isTrainingMode && n->children.size() == 0) + { + n->layer->doTruncationForward = false; + } + if (n->layer->name == "AvgPool2D" && n->children[0]->layer->name == "ReLU") + { + auto avgPool = static_cast *>(n->layer); + auto relu = n->children[0]->layer; + relu->mode = (int)std::log2(avgPool->ks * avgPool->ks); + } + else if (n->layer->name == "Flatten") + { + // delete flatten and add a flag to FC instead + assert(n->parents.size() == 1 && n->children.size() == 1); + auto parent = n->parents[0]; + // printf("%s\n", parent->layer->name.data()); + auto child = n->children[0]; + assert(parent->children.size() == 1); + assert(child->parents.size() == 1); + parent->children[0] = child; + child->parents[0] = parent; + always_assert(parent->currTensor->shape.size() == 4); + always_assert(child->layer->name == "FC"); + auto fc = static_cast *>(child->layer); + // // todo: free the memory used up by flatten + auto batchSz = parent->currTensor->shape[0]; + auto h = parent->currTensor->shape[1]; + auto w = parent->currTensor->shape[2]; + auto c = parent->currTensor->shape[3]; + int m = fc->out; + assert(h * w * c == fc->in); + parent->currTensor = new Tensor(parent->layer->activation.data, parent->layer->activation.d_data, {batchSz, h * w * c}); + // printf("New tensor=%lx\n", parent->currTensor); + parent->currTensor->graphNode = parent; + int i; + for (i = 0; i < n->allNodesInExecutionOrderRef->size(); i++) + { + if (n->allNodesInExecutionOrderRef->at(i) == n) + { + 
break; + } + } + n->allNodesInExecutionOrderRef->erase(n->allNodesInExecutionOrderRef->begin() + i); + } +} + +template +class Piranha : public Orca +{ +public: + Piranha() : Orca() {} + + Piranha(int party, std::string ip, int bw, int scale, std::string keyFile = "") : Orca(party, ip, bw, scale, keyFile) + { + } + + void relu(Tensor &in, Tensor &out, const Tensor &drelu, u64 scale, int mode) + { + // assert(mode == 2); + auto start = std::chrono::high_resolution_clock::now(); + auto k = dcf::readTwoRoundReluKey(&(this->keyBuf)); + auto d_temp = dcf::gpuTwoRoundRelu(this->peer, this->party, k, in.d_data, &(this->g), &(this->s)); + auto d_drelu = d_temp.first; + gpuFree(d_drelu); + out.d_data = d_temp.second; + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + this->s.relu_time += std::chrono::duration_cast(elapsed).count(); + } + + void truncateForward(Tensor &in, u64 shift, u8 mode = 0) + { + auto start = std::chrono::high_resolution_clock::now(); + auto d_inp = in.d_data; + GPUTruncateKey k; + in.d_data = gpuTruncate(this->bw, this->bw, TruncateType::LocalARS, k, (int)shift, this->peer, (int)this->party, (int)in.size(), (T *)in.d_data, &(this->g), &(this->s)); + gpuFree(d_inp); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + this->s.truncate_time += std::chrono::duration_cast(elapsed).count(); + } + + void optimize(LayerGraphNode *root) + { + topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) + { piranhaOpt(n, r); }); + topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) + { pinCpuMem(n, r); }); + } +}; + +template +class PiranhaKeygen : public OrcaKeygen +{ +public: + PiranhaKeygen(int party, int bw, int scale, std::string keyFile) : OrcaKeygen(party, bw, scale, keyFile) + { + } + + void relu(Tensor &in, Tensor &out, const Tensor &drelu, u64 scale, int mode) + { + assert(in.is_same_shape(out)); + assert(in.is_same_shape(drelu)); + // assert(mode == 2); + int 
tmpBw = this->bw - scale - mode; + // printf("Inp=%lx, mode=%d, N=%lu\n", in.d_data, mode, in.size()); + auto d_tempMask = dcf::gpuGenTwoRoundReluKey(&(this->keyBuf), this->party, tmpBw, this->bw, in.size(), in.d_data, &(this->g)); + auto d_dreluMask = d_tempMask.first; + gpuFree(d_dreluMask); + auto d_reluMask = d_tempMask.second; + out.d_data = d_reluMask; + } + + void truncateForward(Tensor &in, u64 shift, u8 mode = 0) + { + auto d_inp = in.d_data; + in.d_data = genGPUTruncateKey(&(this->keyBuf), this->party, TruncateType::LocalARS, this->bw, this->bw, shift, in.size(), in.d_data, &(this->g)); + gpuFree(d_inp); + } + + void optimize(LayerGraphNode *root) + { + topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) + { piranhaOpt(n, r); }); + topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) + { pinCpuMem(n, r); }); + } +}; \ No newline at end of file diff --git a/GPU-MPC/backend/sigma.h b/GPU-MPC/backend/sigma.h new file mode 100644 index 00000000..d5667478 --- /dev/null +++ b/GPU-MPC/backend/sigma.h @@ -0,0 +1,428 @@ +// Author: Neha Jawalkar +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include + +#include +#include +#include +#include + +#include "nn/orca_opt.h" + +#include "utils/gpu_random.h" +#include "utils/gpu_mem.h" + +#include "fss/gpu_matmul.h" +#include "fss/gpu_gelu.h" +#include "fss/gpu_layernorm.h" +#include "fss/gpu_mha.h" +#include "fss/gpu_add.h" + +template +void noTruncateAfterRmsnorm(LayerGraphNode *n, LayerGraphNode *r) +{ + if (n->layer->name == "RMSNorm") + { + n->layer->doTruncationForward = false; + } +} + +template +class SIGMA : public Backend +{ +public: + u8 *startPtr = NULL; + u8 *keyBuf = NULL; + size_t keySize = 0; + // int fd = -1; + GpuPeer *peer = NULL; + int party = -1; + Stats s; + int bw = 0, scale = 0, n_seq = 0; + AESGlobalContext g; + MHATables d_mhaTab; + T *d_geluTab, *d_siluTab; + std::vector *invSqrtTab; + LlamaTransformer *llama; + + SIGMA(int party, std::string ip, std::string keyFile, int bw, int scale, int n_seq, int n_embed, int numThreads) : party(party), bw(bw), scale(scale), n_seq(n_seq) + { + initAESContext(&g); + initGPUMemPool(); + // initCommBufs(true); + + d_geluTab = genLUT>(8, 6, scale); + d_siluTab = genLUT>(10, 6, scale); + d_mhaTab = initMHATables(n_seq, scale); + + omp_set_num_threads(numThreads); + + invSqrtTab = new std::vector(1LL << 13); +#pragma omp parallel for + for (int i = 0; i < (1LL << 13); ++i) + { + GroupElement k = i % (1LL << 6); + GroupElement m = i >> 6; + double val = double(m + 128) * std::pow(2.0, k - 7); + (*invSqrtTab)[i] = GroupElement(double(1LL << (2 * scale)) / sqrt(val / n_embed)); + } + + auto filename = keyFile + "_" + std::to_string(party) + ".dat"; + keySize = std::filesystem::file_size(filename); + int fd = openForReading(filename); + printf("%s, %d\n", 
filename.data(), fd); + getAlignedBuf(&keyBuf, keySize); + readKey(fd, keySize, keyBuf, NULL); + + startPtr = keyBuf; + + LlamaConfig::bitlength = bw; + LlamaConfig::party = party + 2; + LlamaConfig::stochasticT = false; + LlamaConfig::stochasticRT = false; + + llama = new LlamaTransformer(); + if (party == SERVER0) + llama->initServer(ip, (char **)&keyBuf); + else + llama->initClient(ip, (char **)&keyBuf); + + peer = new GpuPeer(true); + peer->peer = LlamaConfig::peer; + } + + void close() + { + peer->close(); + // printf("Key read=%lu\n", keyBuf - startPtr); + } + + void matmul(const Tensor2D &a, const Tensor2D &b, Tensor2D &c, bool useBias, Tensor1D &d, bool isFirst) + { + auto start = std::chrono::high_resolution_clock::now(); + + MatmulParams p; + p.M = a.d1; + p.K = a.d2; + p.N = b.d2; + p.batchSz = 1; + stdInit(p, bw, 0); + auto k = readGPUMatmulKey(p, TruncateType::None, &keyBuf); + c.d_data = gpuMatmul(peer, party, p, k, a.d_data, b.data, useBias ? d.data : (T *)NULL, TruncateType::None, &g, &s, false); + // printf("Matmul weights=%ld, %ld, %ld\n", b.data[0], b.data[1], b.data[b.size() - 1]); + + // auto h_out = (T*) moveToCPU((u8*) c.d_data, p.size_C * sizeof(T), NULL); + // printf("Matmul output=%ld, %ld\n", h_out[0], h_out[1]); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + s.matmul_time += std::chrono::duration_cast(elapsed).count(); + } + + void gelu(const Tensor &in, Tensor &out, u64 scale, u64 mode = 0) + { + u64 b0 = peer->bytesSent() + peer->bytesReceived(); + auto start = std::chrono::high_resolution_clock::now(); + + auto k = readGpuGeluKey(&keyBuf); + out.d_data = gpuGelu(peer, party, k, bw, bw - scale, (int)scale, in.size(), in.d_data, d_geluTab, &g, &s); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + s.gelu_time += std::chrono::duration_cast(elapsed).count(); + u64 b1 = peer->bytesSent() + peer->bytesReceived(); + s.gelu_comm_bytes += (b1 - b0); + } + + void 
silu(const Tensor &in, Tensor &out, u64 scale, u64 mode = 0) + { + u64 b0 = peer->bytesSent() + peer->bytesReceived(); + auto start = std::chrono::high_resolution_clock::now(); + + auto k = readGpuGeluKey(&keyBuf); + out.d_data = gpuGelu(peer, party, k, bw, bw - scale, (int)scale, in.size(), in.d_data, d_siluTab, &g, &s); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + s.gelu_time += std::chrono::duration_cast(elapsed).count(); + u64 b1 = peer->bytesSent() + peer->bytesReceived(); + s.gelu_comm_bytes += (b1 - b0); + } + + void SIGMALayernorm(const Tensor1D &A, const Tensor1D &B, const Tensor &x, Tensor &y, u64 scale, bool computeMu) + { + u64 b0 = peer->bytesSent() + peer->bytesReceived(); + auto start = std::chrono::high_resolution_clock::now(); + + AvgPoolParams p = {bw, bw, scale, 0, 0, 1, x.shape[0], x.shape[1], 1, 1, x.shape[1], 1, x.shape[1], 0, 0, 0, 0}; + initPoolParams(p); + auto k = readGPULayerNormKey(p, &keyBuf, computeMu); + // assert(d_invSqrtTab); + auto d_A = (T *)moveToGPU((u8 *)A.data, A.size() * sizeof(T), &s); + auto d_B = (T *)moveToGPU((u8 *)B.data, B.size() * sizeof(T), &s); + y.d_data = gpuLayerNorm(peer, party, p, k, d_A, d_B, x.d_data, /*(std::vector *)*/ invSqrtTab, &g, &s, computeMu); + gpuFree(d_A); + gpuFree(d_B); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + s.layernorm_time += std::chrono::duration_cast(elapsed).count(); + u64 b1 = peer->bytesSent() + peer->bytesReceived(); + s.layernorm_comm_bytes += (b1 - b0); + } + + void layernorm(const Tensor1D &A, const Tensor1D &B, const Tensor &x, Tensor &y, u64 scale) + { + SIGMALayernorm(A, B, x, y, scale, true); + } + + void rmsnorm(const Tensor1D &A, const Tensor1D &B, const Tensor &x, Tensor &y, u64 scale) + { + SIGMALayernorm(A, B, x, y, scale, false); + } + + void mha(int n_heads, int n_embed, int dim_W, bool selfAttn, bool doNormQKt, bool doRotEmb, const Tensor2D &wQKV, const Tensor1D &bQKV, const 
Tensor2D &wProj, const Tensor1D &bProj, const Tensor2D &X, Tensor2D &Y) + { + auto start = std::chrono::high_resolution_clock::now(); + + MHAParams pMHA = {n_seq, n_embed, n_heads, dim_W, selfAttn, doNormQKt, doRotEmb}; + MHAMulParams pMHAMul = initMHAMulParams(pMHA, bw, scale); + auto k = readGPUMHAKey(pMHA, pMHAMul, &keyBuf); + Y.d_data = gpuMHA(peer, party, bw, scale, pMHA, pMHAMul, k, wQKV.data, bQKV.data, wProj.data, bProj.data, X.d_data, d_mhaTab, &g, &s); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + s.mha_time += std::chrono::duration_cast(elapsed).count(); + } + + void truncateForward(Tensor &in, u64 shift, u8 mode = 0) + { + // printf("Truncate=%lu, %lu, %lu\n", mode, shift, size); + auto start = std::chrono::high_resolution_clock::now(); + + TruncateType t = TruncateType::TrFloor; + auto k = readGPUTruncateKey(t, &keyBuf); + in.d_data = gpuTruncate(k.bin, k.bout, t, k, k.shift, peer, party, k.N, in.d_data, &g, &s); + + // auto h_data = (T*) moveToCPU((u8*) in.d_data, in.size() * sizeof(T), NULL); + // printf("Truncate output=%lu, %lu, %lu\n", h_data[0], h_data[1], h_data[in.size() - 1]); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = end - start; + s.truncate_time += std::chrono::duration_cast(elapsed).count(); + } + + void mul(const Tensor &a, const Tensor &b, Tensor &out) + { + u64 N = a.size(); + auto k = readGPUMulKey(&keyBuf, N, N, N, TruncateType::None); + out.d_data = gpuMul(peer, party, bw, scale, N, k, a.d_data, b.d_data, TruncateType::None, &g, &s); + } + + void output(Tensor &a) + { + // printf("Inside output=%lx\n", a.d_data); + // int tmpBw = bw - scale; + int N = a.size(); + // printf("keyBuf=%lx, %lu\n", keyBuf, keyBuf - startPtr); + unmaskValues(bw, N, a.d_data, (T *)keyBuf, &s); + // printf("boo\n"); + moveIntoCPUMem((u8 *)a.data, (u8 *)a.d_data, N * sizeof(T), &s); + } + + void add(const std::vector *> &in, Tensor &out) + { + int tmpBw = bw - scale; + int N = 
in[0]->size(); + std::vector gpuInp; + for (int i = 0; i < in.size(); i++) + { + gpuInp.push_back(in[i]->d_data); + } + out.d_data = gpuAdd(tmpBw, N, gpuInp); + } + + void optimize(LayerGraphNode *root) + { + topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) + { pinCpuMem(n, r); }); + topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) + { noTruncateAfterRmsnorm(n, r); }); + } +}; + +template +class SIGMAKeygen : public Backend +{ +public: + u8 *startPtr; + u8 *keyBuf = NULL; + size_t keyBufSize = 0; + int party = -1; + std::string keyFile; + int scale; + int bw; + AESGlobalContext g; + LlamaTransformer *llama; + u8 *llamaBuf1, *llamaBuf2; + u8 *dummyBuf1, *dummyBuf2; + + SIGMAKeygen(int party, int bw, int scale, std::string keyFile, size_t keyBufSize) : party(party), bw(bw), scale(scale), keyFile(keyFile), keyBufSize(keyBufSize) + { + initAESContext(&g); + initGPURandomness(); + initGPUMemPool(); + // keyBufSize = 20 * OneGB; + keyBuf = cpuMalloc(keyBufSize); + startPtr = keyBuf; + + LlamaConfig::bitlength = bw; + LlamaConfig::party = DEALER; + LlamaConfig::stochasticT = false; + LlamaConfig::stochasticRT = false; + + llama = new LlamaTransformer(); + llamaBuf1 = (u8 *)cpuMalloc(OneGB); + dummyBuf1 = (u8 *)cpuMalloc(OneGB); + llamaBuf2 = llamaBuf1; + dummyBuf2 = dummyBuf1; + llama->initDealer((char **)(party == SERVER0 ? &llamaBuf2 : &dummyBuf2), (char **)(party == SERVER1 ? 
&llamaBuf2 : &dummyBuf2)); + } + + void close() + { + size_t keySize = keyBuf - startPtr; + size_t padding = 4096 - (keySize % 4096); + char *zeros = new char[padding]; + memset(zeros, 0, padding); + memcpy(keyBuf, zeros, padding); + keyBuf += padding; + keySize += padding; + assert(keySize < keyBufSize); + std::ofstream f(keyFile + "_" + std::to_string(party) + ".dat"); + f.write((char *)startPtr, keySize); + f.close(); + cpuFree(startPtr); + } + + void matmul(const Tensor2D &a, const Tensor2D &b, Tensor2D &c) + { + MatmulParams p; + p.M = a.d1; + p.K = a.d2; + p.N = b.d2; + p.batchSz = 1; + stdInit(p, bw, 0); + c.d_data = gpuKeygenMatmul(&keyBuf, party, p, a.d_data, b.data, (T *)NULL, TruncateType::None, &g, false); + } + + void gelu(const Tensor &in, Tensor &out, u64 scale, u64 mode = 0) + { + out.d_data = gpuKeyGenGelu(&keyBuf, party, bw, bw - scale, (int)scale, in.size(), in.d_data, &g); + } + + void silu(const Tensor &in, Tensor &out, u64 scale, u64 mode = 0) + { + out.d_data = gpuKeyGenGelu(&keyBuf, party, bw, bw - scale, (int)scale, in.size(), in.d_data, &g); + } + + void SIGMALayernormKeygen(const Tensor1D &A, const Tensor1D &B, const Tensor &x, Tensor &y, u64 scale, bool computeMu) + { + AvgPoolParams p = {bw, bw, scale, 0, 0, 1, x.shape[0], x.shape[1], 1, 1, x.shape[1], 1, x.shape[1], 0, 0, 0, 0}; + initPoolParams(p); + auto d_mask_A = (T *)moveToGPU((u8 *)A.data, A.size() * sizeof(T), (Stats *)NULL); + auto d_mask_B = (T *)moveToGPU((u8 *)B.data, B.size() * sizeof(T), (Stats *)NULL); + y.d_data = gpuKeygenLayerNorm(&keyBuf, party, p, d_mask_A, d_mask_B, x.d_data, &g, computeMu); + size_t llamaKeySz = llamaBuf2 - llamaBuf1; + memcpy(keyBuf, llamaBuf1, llamaKeySz); + keyBuf += llamaKeySz; + llamaBuf2 = llamaBuf1; + gpuFree(d_mask_A); + gpuFree(d_mask_B); + } + + void layernorm(const Tensor1D &A, const Tensor1D &B, const Tensor &x, Tensor &y, u64 scale) + { + SIGMALayernormKeygen(A, B, x, y, scale, true); + } + + void rmsnorm(const Tensor1D &A, const 
Tensor1D &B, const Tensor &x, Tensor &y, u64 scale) + { + SIGMALayernormKeygen(A, B, x, y, scale, false); + } + + void mha(int n_heads, int n_embed, int dim_W, bool selfAttn, bool doNormQKt, bool doRotEmb, const Tensor2D &wQKV, const Tensor1D &bQKV, const Tensor2D &wProj, const Tensor1D &bProj, const Tensor2D &X, Tensor2D &Y) + { + MHAParams pMHA = {X.d1, n_embed, n_heads, dim_W, selfAttn, doNormQKt, doRotEmb}; + MHAMulParams pMHAMul = initMHAMulParams(pMHA, bw, scale); + printf("scale=%d\n", pMHAMul.pQKV.shift); + Y.d_data = gpuKeygenMHA(&keyBuf, party, bw, scale, pMHA, pMHAMul, wQKV.data, bQKV.data, wProj.data, bProj.data, X.d_data, &g); + } + + void mul(const Tensor &a, const Tensor &b, Tensor &out) + { + out.d_data = gpuKeygenMul(&keyBuf, party, bw, scale, a.size(), a.d_data, b.d_data, TruncateType::None, &g); + } + + void truncateForward(Tensor &in, u64 shift, u8 mode = 0) + { + TruncateType t = TruncateType::TrFloor; + in.d_data = genGPUTruncateKey(&keyBuf, party, t, bw, bw, shift, in.size(), in.d_data, &g); + } + + void add(const std::vector *> &in, Tensor &out) + { + int tmpBw = bw - scale; + int N = in[0]->size(); + // printf("Add input=%d, %lx, %lx\n", N, in[0]->d_data, in[1]->d_data); + std::vector gpuInp; + for (int i = 0; i < in.size(); i++) + { + gpuInp.push_back(in[i]->d_data); + // printf("Add inp=%lx\n", in[i]->d_data); + } + out.d_data = gpuAdd(tmpBw, N, gpuInp); + } + + void addbias(Tensor &x, const Tensor1D &bias) + { + gpuAddBias(1, x.size() / bias.d1, bias.d1, bw, x.d_data, bias.data, NULL); + } + + void output(Tensor &a) + { + int N = a.size(); + size_t memSz = N * sizeof(T); + moveIntoCPUMem((u8 *)keyBuf, (u8 *)a.d_data, memSz, (Stats *)NULL); + keyBuf += memSz; + } + + void optimize(LayerGraphNode *root) + { + topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) + { pinCpuMem(n, r); }); + topologicalApply(root, [&](LayerGraphNode *n, LayerGraphNode *r) + { noTruncateAfterRmsnorm(n, r); }); + } +}; diff --git 
a/GPU-MPC/experiments/__init__.py b/GPU-MPC/experiments/__init__.py new file mode 100644 index 00000000..11ba17de --- /dev/null +++ b/GPU-MPC/experiments/__init__.py @@ -0,0 +1,21 @@ +# +# Copyright: +# +# Copyright (c) 2024 Microsoft Research +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ diff --git a/GPU-MPC/experiments/orca/__init__.py b/GPU-MPC/experiments/orca/__init__.py new file mode 100644 index 00000000..11ba17de --- /dev/null +++ b/GPU-MPC/experiments/orca/__init__.py @@ -0,0 +1,21 @@ +# +# Copyright: +# +# Copyright (c) 2024 Microsoft Research +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ diff --git a/GPU-MPC/experiments/orca/cnn.h b/GPU-MPC/experiments/orca/cnn.h new file mode 100644 index 00000000..fd3d59b6 --- /dev/null +++ b/GPU-MPC/experiments/orca/cnn.h @@ -0,0 +1,1388 @@ +// +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#pragma once + +#include + +#include "utils/gpu_data_types.h" + +#include "nn/orca/gpu_model.h" +#include "nn/orca/conv2d_layer.h" +#include "nn/orca/maxpool_layer.h" +#include "nn/orca/relu_layer.h" +#include "nn/orca/relu_extend_layer.h" +#include "nn/orca/avg_pool_layer.h" +#include "nn/orca/fc_layer.h" + +#include "backend/orca.h" +#include "backend/piranha.h" + +template +class CNN2 : public SytorchModule +{ + Conv2D *conv1; + ReLU *relu1; + MaxPool2D *maxpool1; + Conv2D *conv2; + ReLU *relu2; + MaxPool2D *maxpool2; + Flatten *flatten3; + FC *fc4; + ReLU *relu4; + FC *fc5; + +public: + CNN2() + { + conv1 = new Conv2D(1, 8, 5, 0, 1, true); + relu1 = new ReLU(); + maxpool1 = new MaxPool2D(2, 0, 2); + + conv2 = new Conv2D(8, 16, 5, 0, 1, true); + relu2 = new ReLU(); + maxpool2 = new MaxPool2D(2, 0, 2); + + flatten3 = new Flatten(); + fc4 = new FC(256, 128, true); + relu4 = new ReLU(); + + fc5 = new FC(128, 10, true); + } + + Tensor &_forward(Tensor &input) + { + auto &var1 = conv1->forward(input); + auto &var2 = relu1->forward(var1); + auto &var3 = maxpool1->forward(var2); + auto &var4 = conv2->forward(var3); + auto &var5 = relu2->forward(var4); + auto &var6 = maxpool2->forward(var5); + auto &var7 = flatten3->forward(var6); + auto &var8 = fc4->forward(var7); + auto &var9 = relu4->forward(var8); + auto &var10 = fc5->forward(var9); + return var10; + } +}; + +template +class PLenetNoReluAvgPool : public SytorchModule +{ + Conv2D *conv1; + AvgPool2D *avgpool1; + ReLU *relu1; + Conv2D *conv2; + AvgPool2D *avgpool2; + ReLU *relu2; + Flatten *flatten3; + FC *fc4; + ReLU *relu4; + FC *fc5; + +public: + PLenetNoReluAvgPool() + { + conv1 = new Conv2D(1, 20, 5, 0, 1, false); + avgpool1 = new AvgPool2D(2, 0, 2); + relu1 = new ReLU(); + conv2 = new Conv2D(20, 50, 5, 0, 1, false); + avgpool2 = new AvgPool2D(2, 0, 2); + relu2 = new ReLU(); + flatten3 = new Flatten(); + fc4 = new FC(800, 500, true); + relu4 = new ReLU(); + fc5 = new FC(500, 10, true); + } + + Tensor 
&_forward(Tensor &input) + { + auto &var1 = conv1->forward(input); + auto &var2 = avgpool1->forward(var1); + auto &var3 = relu1->forward(var2); + auto &var4 = conv2->forward(var3); + auto &var5 = avgpool2->forward(var4); + auto &var6 = relu2->forward(var5); + auto &var7 = flatten3->forward(var6); + auto &var8 = fc4->forward(var7); + auto &var9 = relu4->forward(var8); + auto &var10 = fc5->forward(var9); + return var10; + } +}; + +template +class MinionnLenet : public SytorchModule +{ + Conv2D *conv1; + ReLU *relu1; + MaxPool2D *maxpool1; + Conv2D *conv2; + ReLU *relu2; + MaxPool2D *maxpool2; + Flatten *flatten3; + FC *fc4; + ReLU *relu4; + FC *fc5; + +public: + MinionnLenet() + { + conv1 = new Conv2D(1, 16, 5, 0, 1, true); + relu1 = new ReLU(); + maxpool1 = new MaxPool2D(2, 0, 2); + + conv2 = new Conv2D(16, 16, 5, 0, 1, true); + relu2 = new ReLU(); + maxpool2 = new MaxPool2D(2, 0, 2); + + flatten3 = new Flatten(); + fc4 = new FC(256, 100, true); + relu4 = new ReLU(); + + fc5 = new FC(100, 10, true); + } + + Tensor &_forward(Tensor &input) + { + auto &var1 = conv1->forward(input); + auto &var2 = relu1->forward(var1); + auto &var3 = maxpool1->forward(var2); + auto &var4 = conv2->forward(var3); + auto &var5 = relu2->forward(var4); + auto &var6 = maxpool2->forward(var5); + auto &var7 = flatten3->forward(var6); + auto &var8 = fc4->forward(var7); + auto &var9 = relu4->forward(var8); + auto &var10 = fc5->forward(var9); + return var10; + } +}; + +template +class PSecureMlNoRelu : public SytorchModule +{ + FC *fc1; + ReLU *relu1; + FC *fc2; + ReLU *relu2; + FC *fc3; + +public: + PSecureMlNoRelu() + { + fc1 = new FC(784, 128, true); + relu1 = new ReLU(); + fc2 = new FC(128, 128, true); + relu2 = new ReLU(); + fc3 = new FC(128, 10, true); + } + + Tensor &_forward(Tensor &input) + { + auto &var1 = fc1->forward(input); + auto &var2 = relu1->forward(var1); + auto &var3 = fc2->forward(var2); + auto &var4 = relu2->forward(var3); + auto &var5 = fc3->forward(var4); + return var5; + } 
+}; + +template +class CNN3 : public SytorchModule +{ + Conv2D *conv1; + ReLU *relu1; + MaxPool2D *maxpool1; + Conv2D *conv2; + ReLU *relu2; + MaxPool2D *maxpool2; + Conv2D *conv3; + ReLU *relu3; + MaxPool2D *maxpool3; + Flatten *flatten4; + FC *fc5; + +public: + CNN3() + { + conv1 = new Conv2D(3, 64, 5, 1, 1, true); + relu1 = new ReLU(); + maxpool1 = new MaxPool2D(3, 0, 2); + + conv2 = new Conv2D(64, 64, 5, 1, 1, true); + relu2 = new ReLU(); + maxpool2 = new MaxPool2D(3, 0, 2); + + conv3 = new Conv2D(64, 64, 5, 1, 1, true); + relu3 = new ReLU(); + maxpool3 = new MaxPool2D(3, 0, 2); + + flatten4 = new Flatten(); + fc5 = new FC(64, 10, true); + } + + Tensor &_forward(Tensor &input) + { + auto &var1 = conv1->forward(input); + auto &var2 = relu1->forward(var1); + auto &var3 = maxpool1->forward(var2); + auto &var4 = conv2->forward(var3); + auto &var5 = relu2->forward(var4); + auto &var6 = maxpool2->forward(var5); + auto &var7 = conv3->forward(var6); + auto &var8 = relu3->forward(var7); + auto &var9 = maxpool3->forward(var8); + auto &var10 = flatten4->forward(var9); + auto &var11 = fc5->forward(var10); + return var11; + } +}; + +template +class VGG16 : public SytorchModule +{ + using SytorchModule::add; + +public: + Conv2D *conv0; + ReLU *relu1; + Conv2D *conv2; + MaxPool2D *maxpool3; + ReLU *relu4; + Conv2D *conv5; + ReLU *relu6; + Conv2D *conv7; + MaxPool2D *maxpool8; + ReLU *relu9; + Conv2D *conv10; + ReLU *relu11; + Conv2D *conv12; + ReLU *relu13; + Conv2D *conv14; + MaxPool2D *maxpool15; + ReLU *relu16; + Conv2D *conv17; + ReLU *relu18; + Conv2D *conv19; + ReLU *relu20; + Conv2D *conv21; + MaxPool2D *maxpool22; + ReLU *relu23; + Conv2D *conv24; + ReLU *relu25; + Conv2D *conv26; + ReLU *relu27; + Conv2D *conv28; + MaxPool2D *maxpool29; + ReLU *relu30; + Flatten *reshape31; + FC *gemm32; + ReLU *relu33; + FC *gemm34; + ReLU *relu35; + FC *gemm36; + +public: + VGG16() + { + conv0 = new Conv2D(3, 64, 3, 1, 1, true); + relu1 = new ReLU(); + conv2 = new Conv2D(64, 64, 3, 
1, 1, true); + maxpool3 = new MaxPool2D(2, 0, 2); + relu4 = new ReLU(); + conv5 = new Conv2D(64, 128, 3, 1, 1, true); + relu6 = new ReLU(); + conv7 = new Conv2D(128, 128, 3, 1, 1, true); + maxpool8 = new MaxPool2D(2, 0, 2); + relu9 = new ReLU(); + conv10 = new Conv2D(128, 256, 3, 1, 1, true); + relu11 = new ReLU(); + conv12 = new Conv2D(256, 256, 3, 1, 1, true); + relu13 = new ReLU(); + conv14 = new Conv2D(256, 256, 3, 1, 1, true); + maxpool15 = new MaxPool2D(2, 0, 2); + relu16 = new ReLU(); + conv17 = new Conv2D(256, 512, 3, 1, 1, true); + relu18 = new ReLU(); + conv19 = new Conv2D(512, 512, 3, 1, 1, true); + relu20 = new ReLU(); + conv21 = new Conv2D(512, 512, 3, 1, 1, true); + maxpool22 = new MaxPool2D(2, 0, 2); + relu23 = new ReLU(); + conv24 = new Conv2D(512, 512, 3, 1, 1, true); + relu25 = new ReLU(); + conv26 = new Conv2D(512, 512, 3, 1, 1, true); + relu27 = new ReLU(); + conv28 = new Conv2D(512, 512, 3, 1, 1, true); + maxpool29 = new MaxPool2D(2, 0, 2); + relu30 = new ReLU(); + reshape31 = new Flatten(); + gemm32 = new FC(25088, 4096, true); + relu33 = new ReLU(); + gemm34 = new FC(4096, 4096, true); + relu35 = new ReLU(); + gemm36 = new FC(4096, 1000, true); + } + + Tensor &_forward(Tensor &input) + { + auto &var35 = conv0->forward(input); + auto &var36 = relu1->forward(var35); + auto &var37 = conv2->forward(var36); + auto &var38 = maxpool3->forward(var37); + auto &var39 = relu4->forward(var38); + auto &var40 = conv5->forward(var39); + auto &var41 = relu6->forward(var40); + auto &var42 = conv7->forward(var41); + auto &var43 = maxpool8->forward(var42); + auto &var44 = relu9->forward(var43); + auto &var45 = conv10->forward(var44); + auto &var46 = relu11->forward(var45); + auto &var47 = conv12->forward(var46); + auto &var48 = relu13->forward(var47); + auto &var49 = conv14->forward(var48); + auto &var50 = maxpool15->forward(var49); + auto &var51 = relu16->forward(var50); + auto &var52 = conv17->forward(var51); + auto &var53 = relu18->forward(var52); + auto 
&var54 = conv19->forward(var53); + auto &var55 = relu20->forward(var54); + auto &var56 = conv21->forward(var55); + auto &var57 = maxpool22->forward(var56); + auto &var58 = relu23->forward(var57); + auto &var59 = conv24->forward(var58); + auto &var60 = relu25->forward(var59); + auto &var61 = conv26->forward(var60); + auto &var62 = relu27->forward(var61); + auto &var63 = conv28->forward(var62); + auto &var64 = maxpool29->forward(var63); + auto &var65 = relu30->forward(var64); + auto &var66 = reshape31->forward(var65); + auto &var67 = gemm32->forward(var66); + auto &var68 = relu33->forward(var67); + auto &var69 = gemm34->forward(var68); + auto &var70 = relu35->forward(var69); + auto &var71 = gemm36->forward(var70); + return var71; + } +}; + +template +class ResNet18 : public SytorchModule +{ + using SytorchModule::add; + using SytorchModule::concat; + +public: + Conv2D *conv0; + MaxPool2D *maxpool1; + ReLU *relu2; + Conv2D *conv3; + ReLU *relu4; + Conv2D *conv5; + ReLU *relu7; + Conv2D *conv8; + ReLU *relu9; + Conv2D *conv10; + ReLU *relu12; + Conv2D *conv13; + ReLU *relu14; + Conv2D *conv15; + Conv2D *conv16; + ReLU *relu18; + Conv2D *conv19; + ReLU *relu20; + Conv2D *conv21; + ReLU *relu23; + Conv2D *conv24; + ReLU *relu25; + Conv2D *conv26; + Conv2D *conv27; + ReLU *relu29; + Conv2D *conv30; + ReLU *relu31; + Conv2D *conv32; + ReLU *relu34; + Conv2D *conv35; + ReLU *relu36; + Conv2D *conv37; + Conv2D *conv38; + ReLU *relu40; + Conv2D *conv41; + ReLU *relu42; + Conv2D *conv43; + ReLU *relu45; + GlobalAvgPool2D *globalaveragepool46; + Flatten *flatten47; + FC *gemm48; + +public: + ResNet18() + { + conv0 = new Conv2D(3, 64, 7, 3, 2, true); + maxpool1 = new MaxPool2D(3, 1, 2); + relu2 = new ReLU(); + conv3 = new Conv2D(64, 64, 3, 1, 1, true); + relu4 = new ReLU(); + conv5 = new Conv2D(64, 64, 3, 1, 1, true); + relu7 = new ReLU(); + conv8 = new Conv2D(64, 64, 3, 1, 1, true); + relu9 = new ReLU(); + conv10 = new Conv2D(64, 64, 3, 1, 1, true); + relu12 = new ReLU(); + 
conv13 = new Conv2D(64, 128, 3, 1, 2, true); + relu14 = new ReLU(); + conv15 = new Conv2D(128, 128, 3, 1, 1, true); + conv16 = new Conv2D(64, 128, 1, 0, 2, true); + relu18 = new ReLU(); + conv19 = new Conv2D(128, 128, 3, 1, 1, true); + relu20 = new ReLU(); + conv21 = new Conv2D(128, 128, 3, 1, 1, true); + relu23 = new ReLU(); + conv24 = new Conv2D(128, 256, 3, 1, 2, true); + relu25 = new ReLU(); + conv26 = new Conv2D(256, 256, 3, 1, 1, true); + conv27 = new Conv2D(128, 256, 1, 0, 2, true); + relu29 = new ReLU(); + conv30 = new Conv2D(256, 256, 3, 1, 1, true); + relu31 = new ReLU(); + conv32 = new Conv2D(256, 256, 3, 1, 1, true); + relu34 = new ReLU(); + conv35 = new Conv2D(256, 512, 3, 1, 2, true); + relu36 = new ReLU(); + conv37 = new Conv2D(512, 512, 3, 1, 1, true); + conv38 = new Conv2D(256, 512, 1, 0, 2, true); + relu40 = new ReLU(); + conv41 = new Conv2D(512, 512, 3, 1, 1, true); + relu42 = new ReLU(); + conv43 = new Conv2D(512, 512, 3, 1, 1, true); + relu45 = new ReLU(); + globalaveragepool46 = new GlobalAvgPool2D(); + flatten47 = new Flatten(); + gemm48 = new FC(512, 1000, true); + } + + Tensor &_forward(Tensor &input) + { + auto &var44 = conv0->forward(input); + auto &var45 = maxpool1->forward(var44); + auto &var46 = relu2->forward(var45); + auto &var47 = conv3->forward(var46); + auto &var48 = relu4->forward(var47); + auto &var49 = conv5->forward(var48); + auto &var50 = add(var49, var46); + auto &var51 = relu7->forward(var50); + auto &var52 = conv8->forward(var51); + auto &var53 = relu9->forward(var52); + auto &var54 = conv10->forward(var53); + auto &var55 = add(var54, var51); + auto &var56 = relu12->forward(var55); + auto &var57 = conv13->forward(var56); + auto &var58 = relu14->forward(var57); + auto &var59 = conv15->forward(var58); + auto &var60 = conv16->forward(var56); + auto &var61 = add(var59, var60); + auto &var62 = relu18->forward(var61); + auto &var63 = conv19->forward(var62); + auto &var64 = relu20->forward(var63); + auto &var65 = 
conv21->forward(var64); + auto &var66 = add(var65, var62); + auto &var67 = relu23->forward(var66); + auto &var68 = conv24->forward(var67); + auto &var69 = relu25->forward(var68); + auto &var70 = conv26->forward(var69); + auto &var71 = conv27->forward(var67); + auto &var72 = add(var70, var71); + auto &var73 = relu29->forward(var72); + auto &var74 = conv30->forward(var73); + auto &var75 = relu31->forward(var74); + auto &var76 = conv32->forward(var75); + auto &var77 = add(var76, var73); + auto &var78 = relu34->forward(var77); + auto &var79 = conv35->forward(var78); + auto &var80 = relu36->forward(var79); + auto &var81 = conv37->forward(var80); + auto &var82 = conv38->forward(var78); + auto &var83 = add(var81, var82); + auto &var84 = relu40->forward(var83); + auto &var85 = conv41->forward(var84); + auto &var86 = relu42->forward(var85); + auto &var87 = conv43->forward(var86); + auto &var88 = add(var87, var84); + auto &var89 = relu45->forward(var88); + auto &var90 = globalaveragepool46->forward(var89); + auto &var91 = flatten47->forward(var90); + auto &var92 = gemm48->forward(var91); + return var92; + } +}; + +template +class ResNet50 : public SytorchModule +{ + using SytorchModule::add; + using SytorchModule::concat; + +public: + Conv2D *conv0; + MaxPool2D *maxpool1; + ReLU *relu2; + Conv2D *conv3; + ReLU *relu4; + Conv2D *conv5; + ReLU *relu6; + Conv2D *conv7; + Conv2D *conv8; + ReLU *relu10; + Conv2D *conv11; + ReLU *relu12; + Conv2D *conv13; + ReLU *relu14; + Conv2D *conv15; + ReLU *relu17; + Conv2D *conv18; + ReLU *relu19; + Conv2D *conv20; + ReLU *relu21; + Conv2D *conv22; + ReLU *relu24; + Conv2D *conv25; + ReLU *relu26; + Conv2D *conv27; + ReLU *relu28; + Conv2D *conv29; + Conv2D *conv30; + ReLU *relu32; + Conv2D *conv33; + ReLU *relu34; + Conv2D *conv35; + ReLU *relu36; + Conv2D *conv37; + ReLU *relu39; + Conv2D *conv40; + ReLU *relu41; + Conv2D *conv42; + ReLU *relu43; + Conv2D *conv44; + ReLU *relu46; + Conv2D *conv47; + ReLU *relu48; + Conv2D *conv49; + ReLU 
*relu50; + Conv2D *conv51; + ReLU *relu53; + Conv2D *conv54; + ReLU *relu55; + Conv2D *conv56; + ReLU *relu57; + Conv2D *conv58; + Conv2D *conv59; + ReLU *relu61; + Conv2D *conv62; + ReLU *relu63; + Conv2D *conv64; + ReLU *relu65; + Conv2D *conv66; + ReLU *relu68; + Conv2D *conv69; + ReLU *relu70; + Conv2D *conv71; + ReLU *relu72; + Conv2D *conv73; + ReLU *relu75; + Conv2D *conv76; + ReLU *relu77; + Conv2D *conv78; + ReLU *relu79; + Conv2D *conv80; + ReLU *relu82; + Conv2D *conv83; + ReLU *relu84; + Conv2D *conv85; + ReLU *relu86; + Conv2D *conv87; + ReLU *relu89; + Conv2D *conv90; + ReLU *relu91; + Conv2D *conv92; + ReLU *relu93; + Conv2D *conv94; + ReLU *relu96; + Conv2D *conv97; + ReLU *relu98; + Conv2D *conv99; + ReLU *relu100; + Conv2D *conv101; + Conv2D *conv102; + ReLU *relu104; + Conv2D *conv105; + ReLU *relu106; + Conv2D *conv107; + ReLU *relu108; + Conv2D *conv109; + ReLU *relu111; + Conv2D *conv112; + ReLU *relu113; + Conv2D *conv114; + ReLU *relu115; + Conv2D *conv116; + ReLU *relu118; + GlobalAvgPool2D *globalaveragepool119; + Flatten *flatten120; + FC *gemm121; + +public: + ResNet50() + { + conv0 = new Conv2D(3, 64, 7, 3, 2, true); + maxpool1 = new MaxPool2D(3, 1, 2); + relu2 = new ReLU(); + conv3 = new Conv2D(64, 64, 1, 0, 1, true); + relu4 = new ReLU(); + conv5 = new Conv2D(64, 64, 3, 1, 1, true); + relu6 = new ReLU(); + conv7 = new Conv2D(64, 256, 1, 0, 1, true); + conv8 = new Conv2D(64, 256, 1, 0, 1, true); + relu10 = new ReLU(); + conv11 = new Conv2D(256, 64, 1, 0, 1, true); + relu12 = new ReLU(); + conv13 = new Conv2D(64, 64, 3, 1, 1, true); + relu14 = new ReLU(); + conv15 = new Conv2D(64, 256, 1, 0, 1, true); + relu17 = new ReLU(); + conv18 = new Conv2D(256, 64, 1, 0, 1, true); + relu19 = new ReLU(); + conv20 = new Conv2D(64, 64, 3, 1, 1, true); + relu21 = new ReLU(); + conv22 = new Conv2D(64, 256, 1, 0, 1, true); + relu24 = new ReLU(); + conv25 = new Conv2D(256, 128, 1, 0, 1, true); + relu26 = new ReLU(); + conv27 = new Conv2D(128, 128, 3, 1, 
2, true); + relu28 = new ReLU(); + conv29 = new Conv2D(128, 512, 1, 0, 1, true); + conv30 = new Conv2D(256, 512, 1, 0, 2, true); + relu32 = new ReLU(); + conv33 = new Conv2D(512, 128, 1, 0, 1, true); + relu34 = new ReLU(); + conv35 = new Conv2D(128, 128, 3, 1, 1, true); + relu36 = new ReLU(); + conv37 = new Conv2D(128, 512, 1, 0, 1, true); + relu39 = new ReLU(); + conv40 = new Conv2D(512, 128, 1, 0, 1, true); + relu41 = new ReLU(); + conv42 = new Conv2D(128, 128, 3, 1, 1, true); + relu43 = new ReLU(); + conv44 = new Conv2D(128, 512, 1, 0, 1, true); + relu46 = new ReLU(); + conv47 = new Conv2D(512, 128, 1, 0, 1, true); + relu48 = new ReLU(); + conv49 = new Conv2D(128, 128, 3, 1, 1, true); + relu50 = new ReLU(); + conv51 = new Conv2D(128, 512, 1, 0, 1, true); + relu53 = new ReLU(); + conv54 = new Conv2D(512, 256, 1, 0, 1, true); + relu55 = new ReLU(); + conv56 = new Conv2D(256, 256, 3, 1, 2, true); + relu57 = new ReLU(); + conv58 = new Conv2D(256, 1024, 1, 0, 1, true); + conv59 = new Conv2D(512, 1024, 1, 0, 2, true); + relu61 = new ReLU(); + conv62 = new Conv2D(1024, 256, 1, 0, 1, true); + relu63 = new ReLU(); + conv64 = new Conv2D(256, 256, 3, 1, 1, true); + relu65 = new ReLU(); + conv66 = new Conv2D(256, 1024, 1, 0, 1, true); + relu68 = new ReLU(); + conv69 = new Conv2D(1024, 256, 1, 0, 1, true); + relu70 = new ReLU(); + conv71 = new Conv2D(256, 256, 3, 1, 1, true); + relu72 = new ReLU(); + conv73 = new Conv2D(256, 1024, 1, 0, 1, true); + relu75 = new ReLU(); + conv76 = new Conv2D(1024, 256, 1, 0, 1, true); + relu77 = new ReLU(); + conv78 = new Conv2D(256, 256, 3, 1, 1, true); + relu79 = new ReLU(); + conv80 = new Conv2D(256, 1024, 1, 0, 1, true); + relu82 = new ReLU(); + conv83 = new Conv2D(1024, 256, 1, 0, 1, true); + relu84 = new ReLU(); + conv85 = new Conv2D(256, 256, 3, 1, 1, true); + relu86 = new ReLU(); + conv87 = new Conv2D(256, 1024, 1, 0, 1, true); + relu89 = new ReLU(); + conv90 = new Conv2D(1024, 256, 1, 0, 1, true); + relu91 = new ReLU(); + conv92 = 
new Conv2D(256, 256, 3, 1, 1, true); + relu93 = new ReLU(); + conv94 = new Conv2D(256, 1024, 1, 0, 1, true); + relu96 = new ReLU(); + conv97 = new Conv2D(1024, 512, 1, 0, 1, true); + relu98 = new ReLU(); + conv99 = new Conv2D(512, 512, 3, 1, 2, true); + relu100 = new ReLU(); + conv101 = new Conv2D(512, 2048, 1, 0, 1, true); + conv102 = new Conv2D(1024, 2048, 1, 0, 2, true); + relu104 = new ReLU(); + conv105 = new Conv2D(2048, 512, 1, 0, 1, true); + relu106 = new ReLU(); + conv107 = new Conv2D(512, 512, 3, 1, 1, true); + relu108 = new ReLU(); + conv109 = new Conv2D(512, 2048, 1, 0, 1, true); + relu111 = new ReLU(); + conv112 = new Conv2D(2048, 512, 1, 0, 1, true); + relu113 = new ReLU(); + conv114 = new Conv2D(512, 512, 3, 1, 1, true); + relu115 = new ReLU(); + conv116 = new Conv2D(512, 2048, 1, 0, 1, true); + relu118 = new ReLU(); + globalaveragepool119 = new GlobalAvgPool2D(); + flatten120 = new Flatten(); + gemm121 = new FC(2048, 1000, true); + } + + Tensor &_forward(Tensor &input) + { + auto &var110 = conv0->forward(input); + auto &var111 = maxpool1->forward(var110); + auto &var112 = relu2->forward(var111); + auto &var113 = conv3->forward(var112); + auto &var114 = relu4->forward(var113); + auto &var115 = conv5->forward(var114); + auto &var116 = relu6->forward(var115); + auto &var117 = conv7->forward(var116); + auto &var118 = conv8->forward(var112); + auto &var119 = add(var117, var118); + auto &var120 = relu10->forward(var119); + auto &var121 = conv11->forward(var120); + auto &var122 = relu12->forward(var121); + auto &var123 = conv13->forward(var122); + auto &var124 = relu14->forward(var123); + auto &var125 = conv15->forward(var124); + auto &var126 = add(var125, var120); + auto &var127 = relu17->forward(var126); + auto &var128 = conv18->forward(var127); + auto &var129 = relu19->forward(var128); + auto &var130 = conv20->forward(var129); + auto &var131 = relu21->forward(var130); + auto &var132 = conv22->forward(var131); + auto &var133 = add(var132, var127); + auto 
&var134 = relu24->forward(var133); + auto &var135 = conv25->forward(var134); + auto &var136 = relu26->forward(var135); + auto &var137 = conv27->forward(var136); + auto &var138 = relu28->forward(var137); + auto &var139 = conv29->forward(var138); + auto &var140 = conv30->forward(var134); + auto &var141 = add(var139, var140); + auto &var142 = relu32->forward(var141); + auto &var143 = conv33->forward(var142); + auto &var144 = relu34->forward(var143); + auto &var145 = conv35->forward(var144); + auto &var146 = relu36->forward(var145); + auto &var147 = conv37->forward(var146); + auto &var148 = add(var147, var142); + auto &var149 = relu39->forward(var148); + auto &var150 = conv40->forward(var149); + auto &var151 = relu41->forward(var150); + auto &var152 = conv42->forward(var151); + auto &var153 = relu43->forward(var152); + auto &var154 = conv44->forward(var153); + auto &var155 = add(var154, var149); + auto &var156 = relu46->forward(var155); + auto &var157 = conv47->forward(var156); + auto &var158 = relu48->forward(var157); + auto &var159 = conv49->forward(var158); + auto &var160 = relu50->forward(var159); + auto &var161 = conv51->forward(var160); + auto &var162 = add(var161, var156); + auto &var163 = relu53->forward(var162); + auto &var164 = conv54->forward(var163); + auto &var165 = relu55->forward(var164); + auto &var166 = conv56->forward(var165); + auto &var167 = relu57->forward(var166); + auto &var168 = conv58->forward(var167); + auto &var169 = conv59->forward(var163); + auto &var170 = add(var168, var169); + auto &var171 = relu61->forward(var170); + auto &var172 = conv62->forward(var171); + auto &var173 = relu63->forward(var172); + auto &var174 = conv64->forward(var173); + auto &var175 = relu65->forward(var174); + auto &var176 = conv66->forward(var175); + auto &var177 = add(var176, var171); + auto &var178 = relu68->forward(var177); + auto &var179 = conv69->forward(var178); + auto &var180 = relu70->forward(var179); + auto &var181 = conv71->forward(var180); + auto &var182 
= relu72->forward(var181); + auto &var183 = conv73->forward(var182); + auto &var184 = add(var183, var178); + auto &var185 = relu75->forward(var184); + auto &var186 = conv76->forward(var185); + auto &var187 = relu77->forward(var186); + auto &var188 = conv78->forward(var187); + auto &var189 = relu79->forward(var188); + auto &var190 = conv80->forward(var189); + auto &var191 = add(var190, var185); + auto &var192 = relu82->forward(var191); + auto &var193 = conv83->forward(var192); + auto &var194 = relu84->forward(var193); + auto &var195 = conv85->forward(var194); + auto &var196 = relu86->forward(var195); + auto &var197 = conv87->forward(var196); + auto &var198 = add(var197, var192); + auto &var199 = relu89->forward(var198); + auto &var200 = conv90->forward(var199); + auto &var201 = relu91->forward(var200); + auto &var202 = conv92->forward(var201); + auto &var203 = relu93->forward(var202); + auto &var204 = conv94->forward(var203); + auto &var205 = add(var204, var199); + auto &var206 = relu96->forward(var205); + auto &var207 = conv97->forward(var206); + auto &var208 = relu98->forward(var207); + auto &var209 = conv99->forward(var208); + auto &var210 = relu100->forward(var209); + auto &var211 = conv101->forward(var210); + auto &var212 = conv102->forward(var206); + auto &var213 = add(var211, var212); + auto &var214 = relu104->forward(var213); + auto &var215 = conv105->forward(var214); + auto &var216 = relu106->forward(var215); + auto &var217 = conv107->forward(var216); + auto &var218 = relu108->forward(var217); + auto &var219 = conv109->forward(var218); + auto &var220 = add(var219, var214); + auto &var221 = relu111->forward(var220); + auto &var222 = conv112->forward(var221); + auto &var223 = relu113->forward(var222); + auto &var224 = conv114->forward(var223); + auto &var225 = relu115->forward(var224); + auto &var226 = conv116->forward(var225); + auto &var227 = add(var226, var221); + auto &var228 = relu118->forward(var227); + auto &var229 = 
globalaveragepool119->forward(var228); + auto &var230 = flatten120->forward(var229); + auto &var231 = gemm121->forward(var230); + return var231; + } +}; + +template +class PVGG16NoRelu : public SytorchModule +{ + +public: + Conv2D *conv0; + ReLU *relu1; + Conv2D *conv2; + AvgPool2D *maxpool3; + ReLU *relu4; + Conv2D *conv5; + ReLU *relu6; + Conv2D *conv7; + AvgPool2D *maxpool8; + ReLU *relu9; + Conv2D *conv10; + ReLU *relu11; + Conv2D *conv12; + ReLU *relu13; + Conv2D *conv14; + AvgPool2D *maxpool15; + ReLU *relu16; + Conv2D *conv17; + ReLU *relu18; + Conv2D *conv19; + ReLU *relu20; + Conv2D *conv21; + AvgPool2D *maxpool22; + ReLU *relu23; + Conv2D *conv24; + ReLU *relu25; + Conv2D *conv26; + ReLU *relu27; + Conv2D *conv28; + AvgPool2D *maxpool29; + ReLU *relu30; + Flatten *reshape31; + FC *gemm32; + ReLU *relu33; + FC *gemm34; + ReLU *relu35; + FC *gemm36; + +public: + PVGG16NoRelu() + { + conv0 = new Conv2D(3, 64, 3, 1, 1, false); + relu1 = new ReLU(); + conv2 = new Conv2D(64, 64, 3, 1, 1, false); + maxpool3 = new AvgPool2D(2, 0, 2); + relu4 = new ReLU(); + conv5 = new Conv2D(64, 128, 3, 1, 1, false); + relu6 = new ReLU(); + conv7 = new Conv2D(128, 128, 3, 1, 1, false); + maxpool8 = new AvgPool2D(2, 0, 2); + relu9 = new ReLU(); + conv10 = new Conv2D(128, 256, 3, 1, 1, false); + relu11 = new ReLU(); + conv12 = new Conv2D(256, 256, 3, 1, 1, false); + relu13 = new ReLU(); + conv14 = new Conv2D(256, 256, 3, 1, 1, false); + maxpool15 = new AvgPool2D(2, 0, 2); + relu16 = new ReLU(); + conv17 = new Conv2D(256, 512, 3, 1, 1, false); + relu18 = new ReLU(); + conv19 = new Conv2D(512, 512, 3, 1, 1, false); + relu20 = new ReLU(); + conv21 = new Conv2D(512, 512, 3, 1, 1, false); + maxpool22 = new AvgPool2D(2, 0, 2); + relu23 = new ReLU(); + conv24 = new Conv2D(512, 512, 3, 1, 1, false); + relu25 = new ReLU(); + conv26 = new Conv2D(512, 512, 3, 1, 1, false); + relu27 = new ReLU(); + conv28 = new Conv2D(512, 512, 3, 1, 1, false); + maxpool29 = new AvgPool2D(2, 0, 2); + relu30 = 
new ReLU(); + reshape31 = new Flatten(); + gemm32 = new FC(512, 256, true); + relu33 = new ReLU(); + gemm34 = new FC(256, 256, true); + relu35 = new ReLU(); + gemm36 = new FC(256, 10, true); + } + + Tensor &_forward(Tensor &input) + { + auto &var35 = conv0->forward(input); + auto &var36 = relu1->forward(var35); + auto &var37 = conv2->forward(var36); + auto &var38 = maxpool3->forward(var37); + auto &var39 = relu4->forward(var38); + auto &var40 = conv5->forward(var39); + auto &var41 = relu6->forward(var40); + auto &var42 = conv7->forward(var41); + auto &var43 = maxpool8->forward(var42); + auto &var44 = relu9->forward(var43); + auto &var45 = conv10->forward(var44); + auto &var46 = relu11->forward(var45); + auto &var47 = conv12->forward(var46); + auto &var48 = relu13->forward(var47); + auto &var49 = conv14->forward(var48); + auto &var50 = maxpool15->forward(var49); + auto &var51 = relu16->forward(var50); + auto &var52 = conv17->forward(var51); + auto &var53 = relu18->forward(var52); + auto &var54 = conv19->forward(var53); + auto &var55 = relu20->forward(var54); + auto &var56 = conv21->forward(var55); + auto &var57 = maxpool22->forward(var56); + auto &var58 = relu23->forward(var57); + auto &var59 = conv24->forward(var58); + auto &var60 = relu25->forward(var59); + auto &var61 = conv26->forward(var60); + auto &var62 = relu27->forward(var61); + auto &var63 = conv28->forward(var62); + auto &var64 = maxpool29->forward(var63); + auto &var65 = relu30->forward(var64); + auto &var66 = reshape31->forward(var65); + auto &var67 = gemm32->forward(var66); + auto &var68 = relu33->forward(var67); + auto &var69 = gemm34->forward(var68); + auto &var70 = relu35->forward(var69); + auto &var71 = gemm36->forward(var70); + return var71; + } +}; + +template +class PAlexnetNoRelu : public SytorchModule +{ + +public: + Conv2D *conv0; + AvgPool2D *maxpool1; + ReLU *relu2; + Conv2D *conv3; + AvgPool2D *maxpool4; + ReLU *relu5; + Conv2D *conv6; + ReLU *relu7; + Conv2D *conv8; + ReLU *relu9; + 
Conv2D *conv10; + ReLU *relu11; + Flatten *reshape12; + FC *gemm13; + ReLU *relu14; + FC *gemm15; + ReLU *relu16; + FC *gemm17; + +public: + PAlexnetNoRelu() + { + conv0 = new Conv2D(3, 96, 11, 9, 4, false); + maxpool1 = new AvgPool2D(3, 0, 2); + relu2 = new ReLU(); + conv3 = new Conv2D(96, 256, 5, 1, 1, false); + maxpool4 = new AvgPool2D(2, 0, 1); + relu5 = new ReLU(); + conv6 = new Conv2D(256, 384, 3, 1, 1, false); + relu7 = new ReLU(); + conv8 = new Conv2D(384, 384, 3, 1, 1, false); + relu9 = new ReLU(); + conv10 = new Conv2D(384, 256, 3, 1, 1, false); + relu11 = new ReLU(); + reshape12 = new Flatten(); + gemm13 = new FC(256, 256, true); + relu14 = new ReLU(); + gemm15 = new FC(256, 256, true); + relu16 = new ReLU(); + gemm17 = new FC(256, 10, true); + } + + Tensor &_forward(Tensor &input) + { + auto &var15 = conv0->forward(input); + auto &var16 = maxpool1->forward(var15); + auto &var17 = relu2->forward(var16); + auto &var18 = conv3->forward(var17); + auto &var19 = maxpool4->forward(var18); + auto &var20 = relu5->forward(var19); + auto &var21 = conv6->forward(var20); + auto &var22 = relu7->forward(var21); + auto &var23 = conv8->forward(var22); + auto &var24 = relu9->forward(var23); + auto &var25 = conv10->forward(var24); + auto &var26 = relu11->forward(var25); + auto &var27 = reshape12->forward(var26); + auto &var28 = gemm13->forward(var27); + auto &var29 = relu14->forward(var28); + auto &var30 = gemm15->forward(var29); + auto &var31 = relu16->forward(var30); + auto &var32 = gemm17->forward(var31); + return var32; + } +}; + +template +class FalconAlexnetNoRelu : public SytorchModule +{ + +public: + Conv2D *conv0; + MaxPool2D *maxpool1; + ReLU *relu2; + Conv2D *conv3; + MaxPool2D *maxpool4; + ReLU *relu5; + Conv2D *conv6; + ReLU *relu7; + Conv2D *conv8; + ReLU *relu9; + Conv2D *conv10; + ReLU *relu11; + Flatten *reshape12; + FC *gemm13; + ReLU *relu14; + FC *gemm15; + ReLU *relu16; + FC *gemm17; + +public: + FalconAlexnetNoRelu() + { + conv0 = new Conv2D(3, 96, 
11, 9, 4, true); + maxpool1 = new MaxPool2D(3, 0, 2); + relu2 = new ReLU(); + conv3 = new Conv2D(96, 256, 5, 1, 1, true); + maxpool4 = new MaxPool2D(2, 0, 1); + relu5 = new ReLU(); + conv6 = new Conv2D(256, 384, 3, 1, 1, true); + relu7 = new ReLU(); + conv8 = new Conv2D(384, 384, 3, 1, 1, true); + relu9 = new ReLU(); + conv10 = new Conv2D(384, 256, 3, 1, 1, true); + relu11 = new ReLU(); + reshape12 = new Flatten(); + gemm13 = new FC(256, 256, true); + relu14 = new ReLU(); + gemm15 = new FC(256, 256, true); + relu16 = new ReLU(); + gemm17 = new FC(256, 10, true); + } + + Tensor &_forward(Tensor &input) + { + auto &var15 = conv0->forward(input); + auto &var16 = maxpool1->forward(var15); + auto &var17 = relu2->forward(var16); + auto &var18 = conv3->forward(var17); + auto &var19 = maxpool4->forward(var18); + auto &var20 = relu5->forward(var19); + auto &var21 = conv6->forward(var20); + auto &var22 = relu7->forward(var21); + auto &var23 = conv8->forward(var22); + auto &var24 = relu9->forward(var23); + auto &var25 = conv10->forward(var24); + auto &var26 = relu11->forward(var25); + auto &var27 = reshape12->forward(var26); + auto &var28 = gemm13->forward(var27); + auto &var29 = relu14->forward(var28); + auto &var30 = gemm15->forward(var29); + auto &var31 = relu16->forward(var30); + auto &var32 = gemm17->forward(var31); + return var32; + } +}; + +template +SytorchModule *getCNN(std::string name) +{ + SytorchModule *m; + if (name.compare("CNN2") == 0) + { + m = new CNN2(); + } + else if (name.compare("CNN3") == 0) + { + m = new CNN3(); + } + else if (name.compare("ResNet18") == 0) + { + m = new ResNet18(); + } + else if (name.compare("ResNet50") == 0) + { + m = new ResNet50(); + } + else if (name.compare("VGG16") == 0) + { + m = new VGG16(); + } + else if (name.compare("P-LeNet") == 0) + { + m = new PLenetNoReluAvgPool(); + } + else if (name.compare("P-SecureML") == 0) + { + m = new PSecureMlNoRelu(); + } + else if (name.compare("P-VGG16") == 0) + { + m = new PVGG16NoRelu(); 
+ } + else if (name.compare("P-AlexNet") == 0) + { + m = new PAlexnetNoRelu(); + } + else if (name.compare("AlexNet") == 0) + { + m = new FalconAlexnetNoRelu(); + } + else if (name.compare("ModelB") == 0) + { + m = new MinionnLenet(); + } + else + { + assert(0 && "unknown model"); + } + return m; +} + +template +dcf::orca::GPUModel *getGPUModel(std::string modelName, Tensor inp) +{ + dcf::orca::GPUModel *m; + if (*(modelName.data()) == 'P') + { + m = getPiranhaCNN(modelName, inp); + } + else + { + m = getOrcaCNN(modelName, inp); + } + return m; +} + +// in LlamaImproved, mode takes the value according to the following rule: +// 0: the layer takes as input \ell bits and outputs \ell bits +// 1: the layer takes as input \ell bits and outputs \ell - scale bits +// 2: the layer takes as input \ell - scale bits and outputs \ell bits +// 3: the layer takes as input \ell - scale bits and outputs \ell - scale bits + +template +dcf::orca::GPUModel *getOrcaCNN(std::string modelName, Tensor inp) +{ + auto m = getCNN(modelName); + m->init((u64)dcf::orca::global::scale, inp); + m->train(); + auto b = new Orca(); + m->setBackend(b); + m->optimize(); + dcf::orca::GPUModel *gpuModel = new dcf::orca::GPUModel(); + for (auto n : m->allNodesInExecutionOrder) + { + auto layer = n->layer; + if (layer->name == "Conv2D") + { + assert(layer->mode == 1); + auto convLayer = (Conv2D *)(layer); + int N, h, w, c; + N = convLayer->inputDerivative.shape[0]; + h = convLayer->inputDerivative.shape[1]; + w = convLayer->inputDerivative.shape[2]; + c = convLayer->inputDerivative.shape[3]; + assert(c == convLayer->ci); + auto orcaConv2D = new dcf::orca::Conv2DLayer((int)dcf::orca::global::bw, (int)dcf::orca::global::bw, N, h, w, (int)convLayer->ci, (int)convLayer->fh, (int)convLayer->fw, (int)convLayer->co, (int)convLayer->padding, (int)convLayer->padding, (int)convLayer->padding, (int)convLayer->padding, (int)convLayer->stride, (int)convLayer->stride, convLayer->useBias, 
dcf::TruncateType::StochasticTR, dcf::TruncateType::StochasticTruncate, !layer->isFirst, layer->isFirst); + auto filter = convLayer->getweights(); + // memcpy(orcaConv2D->F, filter.data, filter.size * sizeof(T)); + if (convLayer->useBias) + { + auto bias = convLayer->getbias(); + // memcpy(orcaConv2D->b, bias.data, bias.size * sizeof(T)); + } + gpuModel->layers.push_back(orcaConv2D); + } + + else if (layer->name == "MaxPool2D") + { + assert(layer->mode == 3); + auto maxPoolLayer = (MaxPool2D *)(layer); + int bwToUse = dcf::orca::global::bw; + bwToUse -= dcf::orca::global::scale; + int N, h, w, c; + N = maxPoolLayer->inputDerivative.shape[0]; + h = maxPoolLayer->inputDerivative.shape[1]; + w = maxPoolLayer->inputDerivative.shape[2]; + c = maxPoolLayer->inputDerivative.shape[3]; + auto orcaMaxPool = new dcf::orca::MaxPool2DLayer(bwToUse, bwToUse, dcf::orca::global::bw, N, h, w, c, maxPoolLayer->ks, maxPoolLayer->ks, maxPoolLayer->stride, maxPoolLayer->stride, maxPoolLayer->padding, maxPoolLayer->padding, maxPoolLayer->padding, maxPoolLayer->padding); + gpuModel->layers.push_back(orcaMaxPool); + } + else if (layer->name == "FC") + { + assert(layer->mode == 1); + auto fcLayer = (FC *)(layer); + auto orcaFC = new dcf::orca::FCLayer(dcf::orca::global::bw, dcf::orca::global::bw, (int)fcLayer->inputDerivative.shape[0], (int)fcLayer->out, (int)fcLayer->in, dcf::TruncateType::StochasticTR, dcf::TruncateType::StochasticTruncate, fcLayer->useBias, !layer->isFirst, layer->isFirst); + auto W = fcLayer->getweights(); + // memcpy(orcaFC->W, W.data, W.size * sizeof(T)); + if (fcLayer->useBias) + { + auto bias = fcLayer->getbias(); + // memcpy(orcaFC->Y, bias.data, bias.size * sizeof(T)); + } + gpuModel->layers.push_back(orcaFC); + } + else if (layer->name == "ReLU") + { + assert(layer->mode == 2); + auto reluLayer = (ReLU *)(layer); + int r = layer->activation.size(); + auto orcaRelu = new dcf::orca::ReluExtendLayer(dcf::orca::global::bw - dcf::orca::global::scale, 
dcf::orca::global::bw, r); + gpuModel->layers.push_back(orcaRelu); + } + } + int l = m->allNodesInExecutionOrder.size(); + gpuModel->batchSz = inp.shape[0]; + gpuModel->inpSz = inp.size(); + gpuModel->classes = m->allNodesInExecutionOrder[l - 1]->currTensor->shape[1]; + return gpuModel; +} + +template +dcf::orca::GPUModel *getPiranhaCNN(std::string modelName, Tensor inp) +{ + auto m = getCNN(modelName); + if (modelName.compare("P-SecureML") == 0) + { + Tensor temp(nullptr, {inp.shape[0], inp.size() / inp.shape[0]}); + m->init((u64)dcf::orca::global::scale, temp); + } + else + { + m->init((u64)dcf::orca::global::scale, inp); + } + m->train(); + auto b = new Piranha(); + m->setBackend(b); + m->optimize(); + dcf::orca::GPUModel *gpuModel = new dcf::orca::GPUModel(); + for (auto n : m->allNodesInExecutionOrder) + { + auto layer = n->layer; + if (layer->name == "Conv2D") + { + auto convLayer = (Conv2D *)(layer); + assert(!convLayer->useBias); + int N, h, w, c; + N = convLayer->inputDerivative.shape[0]; + h = convLayer->inputDerivative.shape[1]; + w = convLayer->inputDerivative.shape[2]; + c = convLayer->inputDerivative.shape[3]; + assert(c == convLayer->ci); + auto gpuLayer = new dcf::orca::Conv2DLayer((int)dcf::orca::global::bw, (int)dcf::orca::global::bw, (int)N, (int)h, (int)w, (int)convLayer->ci, (int)convLayer->fh, (int)convLayer->fw, (int)convLayer->co, (int)convLayer->padding, (int)convLayer->padding, (int)convLayer->padding, (int)convLayer->padding, (int)convLayer->stride, (int)convLayer->stride, convLayer->useBias, dcf::TruncateType::LocalARS, dcf::TruncateType::LocalARS, !layer->isFirst, layer->isFirst); + gpuModel->layers.push_back(gpuLayer); + } + else if (layer->name == "FC") + { + auto fcLayer = (FC *)(layer); + auto gpuLayer = new dcf::orca::FCLayer((int)dcf::orca::global::bw, (int)dcf::orca::global::bw, (int)fcLayer->inputDerivative.shape[0], (int)fcLayer->out, (int)fcLayer->in, dcf::TruncateType::LocalARS, dcf::TruncateType::LocalARS, fcLayer->useBias, 
!layer->isFirst, layer->isFirst); + gpuModel->layers.push_back(gpuLayer); + } + else if (layer->name == "ReLU") + { + auto reluLayer = (ReLU *)(layer); + int r = layer->activation.size(); + // printf("r=%lu\n", r); + int inputBw = dcf::orca::global::bw - dcf::orca::global::scale - layer->mode; + auto gpuLayer = new dcf::orca::ReluLayer(inputBw, dcf::orca::global::bw, r); + gpuModel->layers.push_back(gpuLayer); + } + else if (layer->name == "AvgPool2D") + { + auto avgPoolLayer = (AvgPool2D *)(layer); + assert(n->parents.size() == 1); + auto p = n->parents[0]; + auto &a = p->layer->activation; + assert(a.shape.size() == 4); + int N, h, w, c; + N = a.shape[0]; + h = a.shape[1]; + w = a.shape[2]; + c = a.shape[3]; + auto gpuLayer = new dcf::orca::AvgPool2DLayer(dcf::orca::global::bw, dcf::orca::global::bw, dcf::orca::global::scale, N, h, w, c, avgPoolLayer->ks, avgPoolLayer->ks, avgPoolLayer->stride, avgPoolLayer->stride, avgPoolLayer->padding, avgPoolLayer->padding, avgPoolLayer->padding, avgPoolLayer->padding, dcf::TruncateType::LocalARS, dcf::TruncateType::LocalARS); + gpuModel->layers.push_back(gpuLayer); + } + } + int l = m->allNodesInExecutionOrder.size(); + printf("########Layers=%d\n", l); + gpuModel->batchSz = inp.shape[0]; + gpuModel->inpSz = inp.size(); + gpuModel->classes = m->allNodesInExecutionOrder[l - 1]->currTensor->shape[1]; + return gpuModel; +} \ No newline at end of file diff --git a/GPU-MPC/experiments/orca/config.json b/GPU-MPC/experiments/orca/config.json new file mode 100644 index 00000000..7251b442 --- /dev/null +++ b/GPU-MPC/experiments/orca/config.json @@ -0,0 +1,22 @@ +{ + "P0": { + "dealer": { + "gpu": 0, + "key_dir": "/tmp/" + }, + "evaluator": { + "gpu": 1, + "peer": "0.0.0.0" + } + }, + "P1": { + "dealer": { + "gpu": 2, + "key_dir": "/tmp/" + }, + "evaluator": { + "gpu": 3, + "peer": "0.0.0.0" + } + } +} diff --git a/GPU-MPC/experiments/orca/datasets/cifar-10/download-cifar10.sh 
b/GPU-MPC/experiments/orca/datasets/cifar-10/download-cifar10.sh new file mode 100755 index 00000000..ea42d46d --- /dev/null +++ b/GPU-MPC/experiments/orca/datasets/cifar-10/download-cifar10.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +# If not already downloaded +if [ ! -f ./cifar-10-batches-bin/data_batch_1.bin ]; then + # If the archive does not exist, download it + if [ ! -f ./cifar-10-binary.tar.gz ]; then + wget https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz + fi + + # Extract all the files + tar xf cifar-10-binary.tar.gz +fi \ No newline at end of file diff --git a/GPU-MPC/experiments/orca/datasets/cifar10.h b/GPU-MPC/experiments/orca/datasets/cifar10.h new file mode 100644 index 00000000..2ccc0462 --- /dev/null +++ b/GPU-MPC/experiments/orca/datasets/cifar10.h @@ -0,0 +1,333 @@ +//======================================================================= +// Copyright (c) 2017 Baptiste Wicht +// Distributed under the terms of the MIT License. +// (See accompanying file LICENSE or copy at +// http://opensource.org/licenses/MIT) +//======================================================================= + +/*! + * \file + * \brief Contains functions to read the CIFAR-10 dataset + */ + +#ifndef CIFAR10_READER_HPP +#define CIFAR10_READER_HPP + +#include +#include +#include +#include +#include +#include + +namespace cifar { + +/*! + * \brief Represents a complete CIFAR10 dataset + * \tparam Container The container to use + * \tparam Image The type of image + * \tparam Label The type of label + */ +template