From 222db9ab3eed6665d23074fc01572a1f2d24aef7 Mon Sep 17 00:00:00 2001 From: Kanav Gupta Date: Tue, 28 May 2024 19:34:44 +0530 Subject: [PATCH] cpu benchmarking scripts --- GPU-MPC/ext/sytorch/CMakeLists.txt | 49 +++ GPU-MPC/ext/sytorch/benchmarks/bert-base.cpp | 256 +++++++++++++++ GPU-MPC/ext/sytorch/benchmarks/bert-large.cpp | 256 +++++++++++++++ GPU-MPC/ext/sytorch/benchmarks/bert-tiny.cpp | 250 ++++++++++++++ GPU-MPC/ext/sytorch/benchmarks/gpt2.cpp | 226 +++++++++++++ GPU-MPC/ext/sytorch/benchmarks/gptneo.cpp | 304 ++++++++++++++++++ GPU-MPC/ext/sytorch/benchmarks/llama-13b.cpp | 266 +++++++++++++++ GPU-MPC/ext/sytorch/benchmarks/llama-7b.cpp | 266 +++++++++++++++ .../scripts/all-cpu-benchmarks-local.py | 85 +++++ .../scripts/all-cpu-benchmarks-remote.py | 88 +++++ 10 files changed, 2046 insertions(+) create mode 100644 GPU-MPC/ext/sytorch/benchmarks/bert-base.cpp create mode 100644 GPU-MPC/ext/sytorch/benchmarks/bert-large.cpp create mode 100644 GPU-MPC/ext/sytorch/benchmarks/bert-tiny.cpp create mode 100644 GPU-MPC/ext/sytorch/benchmarks/gpt2.cpp create mode 100644 GPU-MPC/ext/sytorch/benchmarks/gptneo.cpp create mode 100644 GPU-MPC/ext/sytorch/benchmarks/llama-13b.cpp create mode 100644 GPU-MPC/ext/sytorch/benchmarks/llama-7b.cpp create mode 100644 GPU-MPC/ext/sytorch/scripts/all-cpu-benchmarks-local.py create mode 100644 GPU-MPC/ext/sytorch/scripts/all-cpu-benchmarks-remote.py diff --git a/GPU-MPC/ext/sytorch/CMakeLists.txt b/GPU-MPC/ext/sytorch/CMakeLists.txt index 544e70a8..9d871fad 100755 --- a/GPU-MPC/ext/sytorch/CMakeLists.txt +++ b/GPU-MPC/ext/sytorch/CMakeLists.txt @@ -370,3 +370,52 @@ add_executable( ) target_link_libraries(gptneobenchmark ${PROJECT_NAME}) + +add_executable( + benchmark-bert-tiny + benchmarks/bert-tiny.cpp +) + +target_link_libraries(benchmark-bert-tiny ${PROJECT_NAME}) + +add_executable( + benchmark-bert-base + benchmarks/bert-base.cpp +) + +target_link_libraries(benchmark-bert-base ${PROJECT_NAME}) + +add_executable( + 
benchmark-bert-large + benchmarks/bert-large.cpp +) + +target_link_libraries(benchmark-bert-large ${PROJECT_NAME}) + +add_executable( + benchmark-gpt2 + benchmarks/gpt2.cpp +) + +target_link_libraries(benchmark-gpt2 ${PROJECT_NAME}) + +add_executable( + benchmark-gptneo + benchmarks/gptneo.cpp +) + +target_link_libraries(benchmark-gptneo ${PROJECT_NAME}) + +add_executable( + benchmark-llama-7b + benchmarks/llama-7b.cpp +) + +target_link_libraries(benchmark-llama-7b ${PROJECT_NAME}) + +add_executable( + benchmark-llama-13b + benchmarks/llama-13b.cpp +) + +target_link_libraries(benchmark-llama-13b ${PROJECT_NAME}) diff --git a/GPU-MPC/ext/sytorch/benchmarks/bert-base.cpp b/GPU-MPC/ext/sytorch/benchmarks/bert-base.cpp new file mode 100644 index 00000000..078f0ab0 --- /dev/null +++ b/GPU-MPC/ext/sytorch/benchmarks/bert-base.cpp @@ -0,0 +1,256 @@ +// Authors: Kanav Gupta, Neha Jawalkar +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +template +class FFN : public SytorchModule +{ +public: + using SytorchModule::gelu; + + u64 in; + u64 hidden; + +public: + FC *up; + FC *down; + + FFN(u64 in, u64 hidden) : in(in), hidden(hidden) + { + up = new FC(in, hidden, true); + down = new FC(hidden, in, true); + } + + Tensor &_forward(Tensor &input) + { + return down->forward(gelu(up->forward(input))); + } +}; + +template +class MultiHeadAttention : public SytorchModule +{ +public: + using SytorchModule::split; + using SytorchModule::view; + using SytorchModule::add; + using SytorchModule::transpose; + using SytorchModule::matmul; + using SytorchModule::scalarmul; + using SytorchModule::invsqrt; + using SytorchModule::softmax; + using SytorchModule::concat; + using SytorchModule::attention_mask; + +public: + FC *c_attn; + FC *c_proj; + + u64 n_heads; + u64 n_embd; + + MultiHeadAttention(u64 n_heads, u64 n_embd) : n_heads(n_heads), n_embd(n_embd) + { + always_assert(n_embd % n_heads == 0); + c_attn = new FC(n_embd, 3 * n_embd, true); + c_proj = new FC(n_embd, n_embd, true); + } + + Tensor &_forward(Tensor &input) + { + auto &x = c_attn->forward(input); + auto &qkv_heads = split(x, 3); + auto &q_heads = view(qkv_heads, 0); + auto &k_heads = view(qkv_heads, 1); + auto &v_heads = view(qkv_heads, 2); + auto &qs = split(q_heads, n_heads); + auto &ks = split(k_heads, n_heads); + auto &vs = split(v_heads, n_heads); + + double divisor = 1 / sqrt(double(n_embd) / double(n_heads)); + + std::vector *> qks_sm_vs; + for (u64 i = 0; i < n_heads; ++i) + { + auto &q = view(qs, i); + auto &k = view(ks, i); + auto &v = view(vs, i); + auto &kt = transpose(k); + 
auto &qk = matmul(q, kt); + auto &qks = scalarmul(qk, divisor); + + auto &qks_sm = softmax(qks); + + auto &qks_sm_v = matmul(qks_sm, v); + qks_sm_vs.push_back(&qks_sm_v); + } + + auto &qks_sm_vs_cat = concat(qks_sm_vs); + auto &res = c_proj->forward(qks_sm_vs_cat); + return res; + } +}; + +template +class TransformerBlock : public SytorchModule +{ +public: + using SytorchModule::add; + + MultiHeadAttention *attn; + FFN *ffn; + LayerNorm *ln0; + LayerNorm *ln1; + + u64 n_heads, n_embd; + +public: + TransformerBlock(u64 n_heads, u64 n_embd) : n_heads(n_heads), n_embd(n_embd) + { + attn = new MultiHeadAttention(n_heads, n_embd); + ffn = new FFN(n_embd, 4 * n_embd); + ln0 = new LayerNorm(n_embd); + ln1 = new LayerNorm(n_embd); + } + + Tensor &_forward(Tensor &input) + { + auto &attn_out = attn->forward(input); + auto &add0_out = add(attn_out, input); + auto &ln0_out = ln0->forward(add0_out); + + auto &ffn_out = ffn->forward(ln0_out); + auto &add1_out = add(ffn_out, ln0_out); + auto &ln1_out = ln1->forward(add1_out); + return ln1_out; + } +}; + +template +class BERT : public SytorchModule +{ +public: + using SytorchModule::tanh; + using SytorchModule::view; + using SytorchModule::add; + using SytorchModule::unsqueeze; + std::vector *> blocks; + LayerNorm *ln_f; + FC *pool; + u64 n_layer, n_heads, n_embd; + +public: + BERT(u64 n_layer, u64 n_heads, u64 n_embd) : n_layer(n_layer), n_heads(n_heads), n_embd(n_embd) + { + for (u64 i = 0; i < n_layer; ++i) + { + blocks.push_back(new TransformerBlock(n_heads, n_embd)); + } + ln_f = new LayerNorm(n_embd); + pool = new FC(n_embd, n_embd, true); + } + + Tensor &_forward(Tensor &input) + { + auto &y = ln_f->forward(input); + Tensor *x = &y; + // Tensor *x = &input; + + for (u64 i = 0; i < n_layer; ++i) + { + auto &block = blocks[i]; + auto &x_out = block->forward(*x); + x = &x_out; + } + + return *x; + } +}; + +int main(int __argc, char **__argv) +{ + sytorch_init(); + + // bert base + const u64 n_embd = 768; + const u64 n_head = 
12; + const u64 n_layer = 12; + const u64 scale = 12; + const u64 bw = 51; + const u64 n_seq = 128; + + int party = atoi(__argv[1]); + std::string ip = "127.0.0.1"; + if (__argc > 2) + ip = __argv[2]; + + using LlamaVersion = LlamaTransformer; + LlamaVersion *llama = new LlamaVersion(); + srand(time(NULL)); + + LlamaConfig::bitlength = bw; + LlamaConfig::party = party; + + llama->init(ip, true); + + BERT net(n_layer, n_head, n_embd); + Tensor input({n_seq, n_embd}); + net.init(scale, input); + net.setBackend(llama); + net.optimize(); + if (party != DEALER) + { + // net.load("bert-tiny-weights.dat"); + // input.load("15469.dat", scale); + input.fill(1LL << (scale - 2)); + } + else + { + net.zero(); + } + + llama->initializeInferencePartyA(net.root); + llama->initializeInferencePartyB(input); + + llama::start(); + net.forward(input); + llama::end(); + + auto &output = net.activation; + llama->outputA(output); + llama->finalize(); + + if (party == CLIENT) + { + auto signedAct = Tensor((i64*) net.activation.data, net.activation.shape); + print(signedAct, scale, bw); + } + return 0; +} \ No newline at end of file diff --git a/GPU-MPC/ext/sytorch/benchmarks/bert-large.cpp b/GPU-MPC/ext/sytorch/benchmarks/bert-large.cpp new file mode 100644 index 00000000..f345d34a --- /dev/null +++ b/GPU-MPC/ext/sytorch/benchmarks/bert-large.cpp @@ -0,0 +1,256 @@ +// Authors: Kanav Gupta, Neha Jawalkar +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be 
included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +template +class FFN : public SytorchModule +{ +public: + using SytorchModule::gelu; + + u64 in; + u64 hidden; + +public: + FC *up; + FC *down; + + FFN(u64 in, u64 hidden) : in(in), hidden(hidden) + { + up = new FC(in, hidden, true); + down = new FC(hidden, in, true); + } + + Tensor &_forward(Tensor &input) + { + return down->forward(gelu(up->forward(input))); + } +}; + +template +class MultiHeadAttention : public SytorchModule +{ +public: + using SytorchModule::split; + using SytorchModule::view; + using SytorchModule::add; + using SytorchModule::transpose; + using SytorchModule::matmul; + using SytorchModule::scalarmul; + using SytorchModule::invsqrt; + using SytorchModule::softmax; + using SytorchModule::concat; + using SytorchModule::attention_mask; + +public: + FC *c_attn; + FC *c_proj; + + u64 n_heads; + u64 n_embd; + + MultiHeadAttention(u64 n_heads, u64 n_embd) : n_heads(n_heads), n_embd(n_embd) + { + always_assert(n_embd % n_heads == 0); + c_attn = new FC(n_embd, 3 * n_embd, true); + c_proj = new FC(n_embd, n_embd, true); + } + + Tensor &_forward(Tensor &input) + { + auto &x = c_attn->forward(input); + auto &qkv_heads = split(x, 3); + auto &q_heads = view(qkv_heads, 0); + auto &k_heads = view(qkv_heads, 1); + auto &v_heads = view(qkv_heads, 2); + auto &qs = split(q_heads, n_heads); + auto &ks = split(k_heads, 
n_heads); + auto &vs = split(v_heads, n_heads); + + double divisor = 1 / sqrt(double(n_embd) / double(n_heads)); + + std::vector *> qks_sm_vs; + for (u64 i = 0; i < n_heads; ++i) + { + auto &q = view(qs, i); + auto &k = view(ks, i); + auto &v = view(vs, i); + auto &kt = transpose(k); + auto &qk = matmul(q, kt); + auto &qks = scalarmul(qk, divisor); + + auto &qks_sm = softmax(qks); + + auto &qks_sm_v = matmul(qks_sm, v); + qks_sm_vs.push_back(&qks_sm_v); + } + + auto &qks_sm_vs_cat = concat(qks_sm_vs); + auto &res = c_proj->forward(qks_sm_vs_cat); + return res; + } +}; + +template +class TransformerBlock : public SytorchModule +{ +public: + using SytorchModule::add; + + MultiHeadAttention *attn; + FFN *ffn; + LayerNorm *ln0; + LayerNorm *ln1; + + u64 n_heads, n_embd; + +public: + TransformerBlock(u64 n_heads, u64 n_embd) : n_heads(n_heads), n_embd(n_embd) + { + attn = new MultiHeadAttention(n_heads, n_embd); + ffn = new FFN(n_embd, 4 * n_embd); + ln0 = new LayerNorm(n_embd); + ln1 = new LayerNorm(n_embd); + } + + Tensor &_forward(Tensor &input) + { + auto &attn_out = attn->forward(input); + auto &add0_out = add(attn_out, input); + auto &ln0_out = ln0->forward(add0_out); + + auto &ffn_out = ffn->forward(ln0_out); + auto &add1_out = add(ffn_out, ln0_out); + auto &ln1_out = ln1->forward(add1_out); + return ln1_out; + } +}; + +template +class BERT : public SytorchModule +{ +public: + using SytorchModule::tanh; + using SytorchModule::view; + using SytorchModule::add; + using SytorchModule::unsqueeze; + std::vector *> blocks; + LayerNorm *ln_f; + FC *pool; + u64 n_layer, n_heads, n_embd; + +public: + BERT(u64 n_layer, u64 n_heads, u64 n_embd) : n_layer(n_layer), n_heads(n_heads), n_embd(n_embd) + { + for (u64 i = 0; i < n_layer; ++i) + { + blocks.push_back(new TransformerBlock(n_heads, n_embd)); + } + ln_f = new LayerNorm(n_embd); + pool = new FC(n_embd, n_embd, true); + } + + Tensor &_forward(Tensor &input) + { + auto &y = ln_f->forward(input); + Tensor *x = &y; + // 
Tensor *x = &input; + + for (u64 i = 0; i < n_layer; ++i) + { + auto &block = blocks[i]; + auto &x_out = block->forward(*x); + x = &x_out; + } + + return *x; + } +}; + +int main(int __argc, char **__argv) +{ + sytorch_init(); + + // bert large + const u64 n_embd = 1024; + const u64 n_head = n_embd / 64; + const u64 n_layer = 24; + const u64 scale = 12; + const u64 bw = 51; + const u64 n_seq = 128; + + int party = atoi(__argv[1]); + std::string ip = "127.0.0.1"; + if (__argc > 2) + ip = __argv[2]; + + using LlamaVersion = LlamaTransformer; + LlamaVersion *llama = new LlamaVersion(); + srand(time(NULL)); + + LlamaConfig::bitlength = bw; + LlamaConfig::party = party; + + llama->init(ip, true); + + BERT net(n_layer, n_head, n_embd); + Tensor input({n_seq, n_embd}); + net.init(scale, input); + net.setBackend(llama); + net.optimize(); + if (party != DEALER) + { + // net.load("bert-tiny-weights.dat"); + // input.load("15469.dat", scale); + input.fill(1LL << (scale - 2)); + } + else + { + net.zero(); + } + + llama->initializeInferencePartyA(net.root); + llama->initializeInferencePartyB(input); + + llama::start(); + net.forward(input); + llama::end(); + + auto &output = net.activation; + llama->outputA(output); + llama->finalize(); + + if (party == CLIENT) + { + auto signedAct = Tensor((i64*) net.activation.data, net.activation.shape); + print(signedAct, scale, bw); + } + return 0; +} \ No newline at end of file diff --git a/GPU-MPC/ext/sytorch/benchmarks/bert-tiny.cpp b/GPU-MPC/ext/sytorch/benchmarks/bert-tiny.cpp new file mode 100644 index 00000000..4f428c8e --- /dev/null +++ b/GPU-MPC/ext/sytorch/benchmarks/bert-tiny.cpp @@ -0,0 +1,250 @@ +// Authors: Kanav Gupta, Neha Jawalkar +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation 
the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +template +class FFN : public SytorchModule +{ +public: + using SytorchModule::gelu; + + u64 in; + u64 hidden; + +public: + FC *up; + FC *down; + + FFN(u64 in, u64 hidden) : in(in), hidden(hidden) + { + up = new FC(in, hidden, true); + down = new FC(hidden, in, true); + } + + Tensor &_forward(Tensor &input) + { + return down->forward(gelu(up->forward(input))); + } +}; + +template +class MultiHeadAttention : public SytorchModule +{ +public: + using SytorchModule::split; + using SytorchModule::view; + using SytorchModule::add; + using SytorchModule::transpose; + using SytorchModule::matmul; + using SytorchModule::scalarmul; + using SytorchModule::invsqrt; + using SytorchModule::softmax; + using SytorchModule::concat; + using SytorchModule::attention_mask; + +public: + FC *c_attn; + FC *c_proj; + + u64 n_heads; + u64 n_embd; + + MultiHeadAttention(u64 n_heads, u64 n_embd) : n_heads(n_heads), n_embd(n_embd) + { + always_assert(n_embd % n_heads == 0); + c_attn = new FC(n_embd, 3 * n_embd, true); + c_proj = new FC(n_embd, n_embd, true); + } + + 
Tensor &_forward(Tensor &input) + { + auto &x = c_attn->forward(input); + auto &qkv_heads = split(x, 3); + auto &q_heads = view(qkv_heads, 0); + auto &k_heads = view(qkv_heads, 1); + auto &v_heads = view(qkv_heads, 2); + auto &qs = split(q_heads, n_heads); + auto &ks = split(k_heads, n_heads); + auto &vs = split(v_heads, n_heads); + + double divisor = 1 / sqrt(double(n_embd) / double(n_heads)); + + std::vector *> qks_sm_vs; + for (u64 i = 0; i < n_heads; ++i) + { + auto &q = view(qs, i); + auto &k = view(ks, i); + auto &v = view(vs, i); + auto &kt = transpose(k); + auto &qk = matmul(q, kt); + auto &qks = scalarmul(qk, divisor); + + auto &qks_sm = softmax(qks); + + auto &qks_sm_v = matmul(qks_sm, v); + qks_sm_vs.push_back(&qks_sm_v); + } + + auto &qks_sm_vs_cat = concat(qks_sm_vs); + auto &res = c_proj->forward(qks_sm_vs_cat); + return res; + } +}; + +template +class TransformerBlock : public SytorchModule +{ +public: + using SytorchModule::add; + + MultiHeadAttention *attn; + FFN *ffn; + LayerNorm *ln0; + LayerNorm *ln1; + + u64 n_heads, n_embd; + +public: + TransformerBlock(u64 n_heads, u64 n_embd) : n_heads(n_heads), n_embd(n_embd) + { + attn = new MultiHeadAttention(n_heads, n_embd); + ffn = new FFN(n_embd, 4 * n_embd); + ln0 = new LayerNorm(n_embd); + ln1 = new LayerNorm(n_embd); + } + + Tensor &_forward(Tensor &input) + { + auto &attn_out = attn->forward(input); + auto &add0_out = add(attn_out, input); + auto &ln0_out = ln0->forward(add0_out); + + auto &ffn_out = ffn->forward(ln0_out); + auto &add1_out = add(ffn_out, ln0_out); + auto &ln1_out = ln1->forward(add1_out); + return ln1_out; + } +}; + +template +class BERT : public SytorchModule +{ +public: + using SytorchModule::tanh; + using SytorchModule::view; + using SytorchModule::add; + using SytorchModule::unsqueeze; + std::vector *> blocks; + LayerNorm *ln_f; + FC *pool; + u64 n_layer, n_heads, n_embd; + +public: + BERT(u64 n_layer, u64 n_heads, u64 n_embd) : n_layer(n_layer), n_heads(n_heads), 
n_embd(n_embd) + { + for (u64 i = 0; i < n_layer; ++i) + { + blocks.push_back(new TransformerBlock(n_heads, n_embd)); + } + ln_f = new LayerNorm(n_embd); + pool = new FC(n_embd, n_embd, true); + } + + Tensor &_forward(Tensor &input) + { + auto &y = ln_f->forward(input); + Tensor *x = &y; + // Tensor *x = &input; + + for (u64 i = 0; i < n_layer; ++i) + { + auto &block = blocks[i]; + auto &x_out = block->forward(*x); + x = &x_out; + } + + return *x; + } +}; + +int main(int __argc, char **__argv) +{ + sytorch_init(); + + // bert tiny + const u64 n_embd = 128; + const u64 n_head = n_embd / 64; + const u64 n_layer = 2; + const u64 scale = 12; + const u64 bw = 37; + const u64 n_seq = 128; + + int party = atoi(__argv[1]); + std::string ip = "127.0.0.1"; + if (__argc > 2) + ip = __argv[2]; + + using LlamaVersion = LlamaTransformer; + LlamaVersion *llama = new LlamaVersion(); + srand(time(NULL)); + + LlamaConfig::bitlength = bw; + LlamaConfig::party = party; + + llama->init(ip, true); + + BERT net(n_layer, n_head, n_embd); + Tensor input({n_seq, n_embd}); + net.init(scale, input); + net.setBackend(llama); + net.optimize(); + if (party != DEALER) + { + // net.load("bert-tiny-weights.dat"); + // input.load("15469.dat", scale); + input.fill(1LL << (scale - 2)); + } + else + { + net.zero(); + } + + llama->initializeInferencePartyA(net.root); + llama->initializeInferencePartyB(input); + + llama::start(); + net.forward(input); + llama::end(); + + auto &output = net.activation; + llama->outputA(output); + llama->finalize(); + return 0; +} \ No newline at end of file diff --git a/GPU-MPC/ext/sytorch/benchmarks/gpt2.cpp b/GPU-MPC/ext/sytorch/benchmarks/gpt2.cpp new file mode 100644 index 00000000..52efc339 --- /dev/null +++ b/GPU-MPC/ext/sytorch/benchmarks/gpt2.cpp @@ -0,0 +1,226 @@ +// Authors: Kanav Gupta, Neha Jawalkar +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this 
software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include +#include +#include +#include +#include + +template +class FFN : public SytorchModule +{ + using SytorchModule::gelu; + + u64 in; + u64 hidden; +public: + FC *up; + FC *down; + + FFN(u64 in, u64 hidden) : in(in), hidden(hidden) + { + up = new FC(in, hidden, true); + down = new FC(hidden, in, true); + } + + Tensor &_forward(Tensor &input) + { + return down->forward(gelu(up->forward(input))); + } +}; + +template +class MultiHeadAttention : public SytorchModule +{ + using SytorchModule::split; + using SytorchModule::view; + using SytorchModule::transpose; + using SytorchModule::matmul; + using SytorchModule::matmul_triangular; + using SytorchModule::scalarmul; + using SytorchModule::softmax_triangular; + using SytorchModule::concat; + +public: + FC *c_attn; + FC *c_proj; + + u64 n_heads; + u64 n_embd; + + MultiHeadAttention(u64 n_heads, u64 n_embd): n_heads(n_heads), n_embd(n_embd) + { + always_assert(n_embd % n_heads == 0); + c_attn = new FC(n_embd, 3*n_embd, true); + c_proj = new 
FC(n_embd, n_embd, true); + } + + Tensor &_forward(Tensor &input) + { + auto &x = c_attn->forward(input); + auto &qkv_heads = split(x, 3); + auto &q_heads = view(qkv_heads, 0); + auto &k_heads = view(qkv_heads, 1); + auto &v_heads = view(qkv_heads, 2); + auto &qs = split(q_heads, n_heads); + auto &ks = split(k_heads, n_heads); + auto &vs = split(v_heads, n_heads); + + double divisor = 1 / sqrt(double(n_embd) / double(n_heads)); + + std::vector*> qks_sm_vs; + for(u64 i = 0; i < n_heads; ++i) + { + auto &q = view(qs, i); + auto &k = view(ks, i); + auto &v = view(vs, i); + auto &kt = transpose(k); + auto &qk = matmul_triangular(q, kt); + auto &qks = scalarmul(qk, divisor); + + auto &qks_sm = softmax_triangular(qks); + + auto &qks_sm_v = matmul(qks_sm, v); + qks_sm_vs.push_back(&qks_sm_v); + } + + auto &qks_sm_vs_cat = concat(qks_sm_vs); + auto &res = c_proj->forward(qks_sm_vs_cat); + return res; + } +}; + +template +class TransformerBlock : public SytorchModule +{ + using SytorchModule::add; + + MultiHeadAttention *attn; + FFN *ffn; + LayerNorm *ln0; + LayerNorm *ln1; + + u64 n_heads, n_embd; +public: + + TransformerBlock(u64 n_heads, u64 n_embd): n_heads(n_heads), n_embd(n_embd) + { + attn = new MultiHeadAttention(n_heads, n_embd); + ffn = new FFN(n_embd, 4*n_embd); + ln0 = new LayerNorm(n_embd); + ln1 = new LayerNorm(n_embd); + } + + Tensor &_forward(Tensor &input) + { + auto &ln0_out = ln0->forward(input); + auto &attn_out = attn->forward(ln0_out); + auto &attn_out_add = add(attn_out, input); + auto &ln1_out = ln1->forward(attn_out_add); + auto &ffn_out = ffn->forward(ln1_out); + auto &ffn_out_add = add(ffn_out, attn_out_add); + return ffn_out_add; + } +}; + +template +class GPT2 : public SytorchModule +{ + std::vector *> blocks; + LayerNorm *ln_f; + u64 n_layer, n_heads, n_embd; + +public: + + GPT2(u64 n_layer, u64 n_heads, u64 n_embd): n_layer(n_layer), n_heads(n_heads), n_embd(n_embd) + { + for(u64 i = 0; i < n_layer; ++i) + { + blocks.push_back(new 
TransformerBlock(n_heads, n_embd)); + } + ln_f = new LayerNorm(n_embd); + } + + Tensor &_forward(Tensor &input) + { + Tensor *x = &input; + + for(u64 i = 0; i < n_layer; ++i) + { + auto &block = blocks[i]; + auto &x_out = block->forward(*x); + x = &x_out; + } + // return ln_f->forward(*x); + return *x; + } +}; + +int main(int __argc, char**__argv) +{ + sytorch_init(); + + // gpt2 + const u64 n_embd = 768; + const u64 n_head = 12; + const u64 n_layer = 12; + const u64 bw = 51; + + const u64 scale = 12; + const u64 n_seq = 128; + + int party = atoi(__argv[1]); + std::string ip = "127.0.0.1"; + if (__argc > 2) + ip = __argv[2]; + + using LlamaVersion = LlamaTransformer; + LlamaVersion *llama = new LlamaVersion(); + srand(time(NULL)); + + LlamaConfig::bitlength = bw; + LlamaConfig::party = party; + + llama->init(ip, true); + + GPT2 net(n_layer, n_head, n_embd); + net.init(scale); + net.setBackend(llama); + net.optimize(); + llama->initializeInferencePartyA(net.root); + + Tensor input({n_seq, n_embd}); + if(party == CLIENT){ + input.fill(1LL << (scale-2)); + } + llama->initializeInferencePartyB(input); + + llama::start(); + net.forward(input); + llama::end(); + + auto &output = net.activation; + llama->outputA(output); + llama->finalize(); + + return 0; +} \ No newline at end of file diff --git a/GPU-MPC/ext/sytorch/benchmarks/gptneo.cpp b/GPU-MPC/ext/sytorch/benchmarks/gptneo.cpp new file mode 100644 index 00000000..c44f5dd6 --- /dev/null +++ b/GPU-MPC/ext/sytorch/benchmarks/gptneo.cpp @@ -0,0 +1,304 @@ +// Authors: Kanav Gupta, Neha Jawalkar +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit 
persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include +#include +#include +#include +#include + +template +class FFN : public SytorchModule +{ + using SytorchModule::gelu; + + u64 in; + u64 hidden; +public: + FC *up; + FC *down; + + FFN(u64 in, u64 hidden) : in(in), hidden(hidden) + { + up = new FC(in, hidden, true); + down = new FC(hidden, in, true); + } + + Tensor &_forward(Tensor &input) + { + return down->forward(gelu(up->forward(input))); + } +}; + +template +class MultiHeadAttention : public SytorchModule +{ + using SytorchModule::split; + using SytorchModule::view; + using SytorchModule::add; + using SytorchModule::transpose; + using SytorchModule::matmul; + using SytorchModule::scalarmul; + using SytorchModule::invsqrt; + using SytorchModule::softmax; + using SytorchModule::concat; + using SytorchModule::attention_mask; + // using SytorchModule::local_attention_mask; + /////////////////////////// + using SytorchModule::matmul_triangular; + using SytorchModule::softmax_triangular; + +public: + // FC *c_attn; + FC *k_attn; + FC *v_attn; + FC *q_attn; + FC *c_proj; + + u64 n_heads; + u64 n_embd; + u64 attention_type; + u64 window_size; + + MultiHeadAttention(u64 n_heads, u64 n_embd, u64 attention_type, u64 window_size): n_heads(n_heads), n_embd(n_embd), attention_type(attention_type), window_size(window_size) /* store every ctor argument: the parameters shadow the members, and _forward() branches on the attention_type member */ + { + always_assert(n_embd % n_heads == 0); + 
// c_attn = new FC(n_embd, 3*n_embd, true); + k_attn = new FC(n_embd, n_embd, false); + v_attn = new FC(n_embd, n_embd, false); + q_attn = new FC(n_embd, n_embd, false); + c_proj = new FC(n_embd, n_embd, true); + } + + Tensor &_forward(Tensor &input) + { + // auto &x = c_attn->forward(input); + // auto &qkv_heads = split(x, 3); + // auto &q_heads = view(qkv_heads, 0); + // auto &k_heads = view(qkv_heads, 1); + // auto &v_heads = view(qkv_heads, 2); + auto &k_heads = k_attn->forward(input); + auto &v_heads = v_attn->forward(input); + auto &q_heads = q_attn->forward(input); + auto &qs = split(q_heads, n_heads); + auto &ks = split(k_heads, n_heads); + auto &vs = split(v_heads, n_heads); + + // double divisor = 1 / sqrt(double(n_embd) / double(n_heads)); + // double divisor = 1; + + std::vector*> qks_sm_vs; + for(u64 i = 0; i < n_heads; ++i) + { + auto &q = view(qs, i); + auto &k = view(ks, i); + auto &v = view(vs, i); + auto &kt = transpose(k); + // auto &qks = matmul(q, kt); + auto &qks = matmul_triangular(q, kt); + // auto &qk = matmul(q, kt); + // auto &qks = scalarmul(qk, divisor); + + /* + Tensor *x = &input; + if(attention_type % 2 == 0) + { + // printf("global\n"); + auto &qks_masked = attention_mask(qks, 10000.0); + x = &qks_masked; + } + else + { + auto &qks_masked = local_attention_mask(qks, 10000.0); + x = &qks_masked; + } + auto &qks_sm = softmax(*x); + auto &qks_sm_v = matmul(qks_sm, v); + */ + + Tensor *x = &input; + if(attention_type % 2 == 0) + { + auto &qks_sm = softmax_triangular(qks); + x = &qks_sm; + } + else + { + // auto &qks_masked = local_attention_mask(qks, 10000.0); + // auto &qks_sm = softmax_triangular(qks_masked); + + auto &qks_sm = softmax_triangular(qks); + x = &qks_sm; + } + auto &qks_sm_v = matmul(*x, v); + + qks_sm_vs.push_back(&qks_sm_v); + } + + auto &qks_sm_vs_cat = concat(qks_sm_vs); + auto &res = c_proj->forward(qks_sm_vs_cat); + return res; + } +}; + +template +class TransformerBlock : public SytorchModule +{ + using 
SytorchModule::add; + + MultiHeadAttention *attn; + FFN *ffn; + LayerNorm *ln0; + LayerNorm *ln1; + + u64 n_heads, n_embd; + u64 attention_type; + u64 window_size; +public: + + TransformerBlock(u64 n_heads, u64 n_embd, u64 attention_type, u64 window_size): n_heads(n_heads), n_embd(n_embd), attention_type(attention_type), window_size(window_size) /* record attention_type/window_size rather than leaving the same-named members uninitialized */ + { + attn = new MultiHeadAttention(n_heads, n_embd, attention_type, window_size); + ffn = new FFN(n_embd, 4*n_embd); + ln0 = new LayerNorm(n_embd); + ln1 = new LayerNorm(n_embd); + } + + Tensor &_forward(Tensor &input) + { + auto &ln0_out = ln0->forward(input); + auto &attn_out = attn->forward(ln0_out); + auto &attn_out_add = add(attn_out, input); + auto &ln1_out = ln1->forward(attn_out_add); + auto &ffn_out = ffn->forward(ln1_out); + auto &ffn_out_add = add(ffn_out, attn_out_add); + return ffn_out_add; + } +}; + +template +class GPT2 : public SytorchModule +{ + std::vector *> blocks; + // LayerNorm *ln_f; + u64 n_layer, n_heads, n_embd; + u64 window_size; + +public: + + GPT2(u64 n_layer, u64 n_heads, u64 n_embd, u64 window_size): n_layer(n_layer), n_heads(n_heads), n_embd(n_embd), window_size(window_size) /* record window_size rather than leaving the same-named member uninitialized */ + { + for(u64 i = 0; i < n_layer; ++i) + { + blocks.push_back(new TransformerBlock(n_heads, n_embd, i, window_size)); + } + // ln_f = new LayerNorm(n_embd); + } + + Tensor &_forward(Tensor &input) + { + Tensor *x = &input; + + for(u64 i = 0; i < n_layer - 1; ++i) + { + auto &block = blocks[i]; + auto &x_out = block->forward(*x); + x = &x_out; + } + + auto &block = blocks[n_layer - 1]; + return block->forward(*x); + + // for(u64 i = 0; i < n_layer; ++i) + // { + // auto &block = blocks[i]; + // auto &x_out = block->forward(*x); + // x = &x_out; + // } + // return ln_f->forward(*x); + } +}; + + +int lt_main(int __argc, char**__argv){ + + sytorch_init(); + + + const u64 n_embd = 2048; + const u64 n_head = 16; + const u64 n_layer = 24; + const u64 window_size = 256; + + int party = atoi(__argv[1]); + std::string ip = "127.0.0.1"; + if (__argc > 2) + ip = __argv[2]; + + using LlamaVersion = LlamaTransformer; 
+ LlamaVersion *llama = new LlamaVersion(); + srand(time(NULL)); + + const u64 scale = 12; + + LlamaConfig::bitlength = 52; + LlamaConfig::party = party; + LlamaConfig::stochasticT = false; + LlamaConfig::stochasticRT = false; + LlamaConfig::num_threads = 4; + + llama->init(ip, true); + + GPT2 net(n_layer, n_head, n_embd, window_size); + net.init(scale); + net.setBackend(llama); + net.optimize(); + if(party == SERVER){ + // net.load("gpt-neo-1pt3B-weights.dat"); + net.zero(); + } + else if(party == DEALER){ + net.zero(); + } + llama->initializeInferencePartyA(net.root); + + u64 n_seq = 128; + Tensor input({n_seq, n_embd}); + if(party == CLIENT){ + input.fill(1LL << (scale-2)); + } + llama->initializeInferencePartyB(input); + + llama::start(); + net.forward(input); + llama::end(); + + auto &output = net.activation; + llama->outputA(output); + llama->finalize(); + + return 0; +} + +int main(int __argc, char**__argv) +{ + lt_main(__argc,__argv); +} diff --git a/GPU-MPC/ext/sytorch/benchmarks/llama-13b.cpp b/GPU-MPC/ext/sytorch/benchmarks/llama-13b.cpp new file mode 100644 index 00000000..e208cad5 --- /dev/null +++ b/GPU-MPC/ext/sytorch/benchmarks/llama-13b.cpp @@ -0,0 +1,266 @@ +// Authors: Kanav Gupta, Neha Jawalkar +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+template
+class FFN : public SytorchModule // gated feed-forward block: down(silu(up1(x)) * up2(x))
+{
+    using SytorchModule::silu;
+    using SytorchModule::mul;
+
+    u64 in;
+    u64 intermediate_size;
+
+public:
+    FC *up1;  // in -> intermediate_size, goes through silu
+    FC *up2;  // in -> intermediate_size, multiplied with silu(up1(x))
+    FC *down; // intermediate_size -> in
+
+    FFN(u64 in, u64 intermediate_size) : in(in), intermediate_size(intermediate_size)
+    {
+        up1 = new FC(in, intermediate_size, false); // third FC argument presumably disables a bias term - TODO confirm
+        up2 = new FC(in, intermediate_size, false);
+        down = new FC(intermediate_size, in, false);
+    }
+
+    Tensor &_forward(Tensor &input) // returns down(mul(silu(up1(input)), up2(input)))
+    {
+        auto &a = up1->forward(input);
+        auto &b = up2->forward(input);
+        return down->forward(mul(silu(a), b));
+    }
+};
+
+template
+class MultiHeadAttention : public SytorchModule // self-attention with rotary_embedding applied to q and k of every head
+{
+    using SytorchModule::split;
+    using SytorchModule::view;
+    using SytorchModule::transpose;
+    using SytorchModule::matmul;
+    using SytorchModule::matmul_triangular;
+    using SytorchModule::scalarmul;
+    using SytorchModule::softmax_triangular;
+    using SytorchModule::concat;
+
+    using SytorchModule::mul;
+    using SytorchModule::add;
+    using SytorchModule::silu;
+    using SytorchModule::rotary_embedding;
+
+public:
+    FC *q_attn; // n_embd -> n_embd query projection
+    FC *k_attn; // n_embd -> n_embd key projection
+    FC *v_attn; // n_embd -> n_embd value projection
+    FC *c_proj; // n_embd -> n_embd projection applied to the concatenated heads
+
+    u64 n_heads;
+    u64 n_embd;
+
+    MultiHeadAttention(u64 n_heads, u64 n_embd) : n_heads(n_heads), n_embd(n_embd)
+    {
+        always_assert(n_embd % n_heads == 0); // n_embd has to divide evenly across the heads
+        q_attn = new FC(n_embd, n_embd, false);
+        k_attn = new FC(n_embd, n_embd, false);
+        v_attn = new FC(n_embd, n_embd, false);
+        c_proj = new FC(n_embd, n_embd, false);
+    }
+
+    Tensor &_forward(Tensor &input) // returns c_proj(concat over heads of softmax_triangular(scaled q1 * k1^T) * v)
+    {
+        auto &q_heads = q_attn->forward(input);
+        auto &k_heads = k_attn->forward(input);
+        auto &v_heads = v_attn->forward(input);
+        auto &qs = split(q_heads, n_heads); // one slice per head, addressed through view(..., i) below
+        auto &ks = split(k_heads, n_heads);
+        auto &vs = split(v_heads, n_heads);
+
+        double divisor = 1 / sqrt(double(n_embd) / double(n_heads)); // 1 / sqrt(per-head width)
+
+        std::vector *> qks_sm_vs; // per-head outputs, concatenated after the loop
+        for (u64 i = 0; i < n_heads; ++i)
+        {
+            auto &q = view(qs, i);
+            auto &k = view(ks, i);
+            auto &v = view(vs, i);
+
+            auto &q1 = rotary_embedding(q); // only q and k are rotated; v is used as is
+            auto &k1 = rotary_embedding(k);
+
+            auto &kt = transpose(k1);
+            auto &qk = matmul_triangular(q1, kt);
+            auto &qks = scalarmul(qk, divisor);
+
+            auto &qks_sm = softmax_triangular(qks);
+
+            auto &qks_sm_v = matmul(qks_sm, v);
+            qks_sm_vs.push_back(&qks_sm_v);
+        }
+
+        auto &qks_sm_vs_cat = concat(qks_sm_vs);
+        auto &res = c_proj->forward(qks_sm_vs_cat);
+        return res;
+    }
+};
+
+template
+class TransformerBlock : public SytorchModule // pre-norm residual block: y = x + attn(ln0(x)); out = y + ffn(ln1(y))
+{
+    using SytorchModule::add;
+
+    MultiHeadAttention *attn;
+    FFN *ffn;
+    RMSNorm *ln0; // applied before attention
+    RMSNorm *ln1; // applied before the feed-forward block
+
+    u64 n_heads, n_embd, intermediate_size;
+
+public:
+    TransformerBlock(u64 n_heads, u64 n_embd, u64 intermediate_size) : n_heads(n_heads), n_embd(n_embd), intermediate_size(intermediate_size)
+    {
+        attn = new MultiHeadAttention(n_heads, n_embd);
+        ffn = new FFN(n_embd, intermediate_size);
+        ln0 = new RMSNorm(n_embd, false); // NOTE(review): meaning of the second RMSNorm argument is not visible here
+        ln1 = new RMSNorm(n_embd, false);
+    }
+
+    Tensor &_forward(Tensor &input) // returns ffn(ln1(y)) + y with y = attn(ln0(input)) + input
+    {
+        auto &ln0_out = ln0->forward(input);
+        auto &attn_out = attn->forward(ln0_out);
+        auto &attn_out_add = add(attn_out, input); // first residual connection
+        auto &ln1_out = ln1->forward(attn_out_add);
+        auto &ffn_out = ffn->forward(ln1_out);
+        auto &ffn_out_add = add(ffn_out, attn_out_add); // second residual connection
+        return ffn_out_add;
+    }
+};
+
+template
+class LLAMA_MODEL : public SytorchModule // stack of n_layer TransformerBlocks; ln_f is built but not applied
+{
+    std::vector *> blocks;
+    RMSNorm *ln_f; // constructed below, but the forward call that would use it is commented out
+    u64 n_layer, n_heads, n_embd, intermediate_size;
+
+public:
+    LLAMA_MODEL(u64 n_layer, u64 n_heads, u64 n_embd, u64 intermediate_size) : n_layer(n_layer), n_heads(n_heads), n_embd(n_embd), intermediate_size(intermediate_size)
+    {
+        for (u64 i = 0; i < n_layer; ++i)
+        {
+            blocks.push_back(new TransformerBlock(n_heads, n_embd, intermediate_size));
+        }
+        ln_f = new RMSNorm(n_embd);
+    }
+
+    Tensor &_forward(Tensor &input) // feeds input through every block in order and returns the last block's output
+    {
+        Tensor *x = &input;
+        for (u64 i = 0; i < n_layer; ++i)
+        {
+            auto &block = blocks[i];
+            auto &x_out = block->forward(*x);
+            x = &x_out;
+        }
+        return *x;
+        // return ln_f->forward(*x);
+    }
+};
+
+template
+class LlamaNextWordLogits : public SytorchModule // wraps LLAMA_MODEL; the logits projection (fc) is currently bypassed
+{
+    using SytorchModule::view;
+    using SytorchModule::transpose;
+    LLAMA_MODEL *llama_model;
+    FC *fc; // n_embd -> n_vocab; constructed, but its forward call is commented out
+    u64 n_layer, n_heads, n_embd, n_vocab, intermediate_size;
+
+public:
+    LlamaNextWordLogits(u64 n_layer, u64 n_heads, u64 n_embd, u64 n_vocab, u64 intermediate_size) : n_layer(n_layer), n_heads(n_heads), n_embd(n_embd), n_vocab(n_vocab), intermediate_size(intermediate_size)
+    {
+        llama_model = new LLAMA_MODEL(n_layer, n_heads, n_embd, intermediate_size);
+        fc = new FC(n_embd, n_vocab, false);
+    }
+
+    Tensor &_forward(Tensor &input) // returns the transformer stack's output unchanged
+    {
+        auto &fc_in = llama_model->forward(input);
+        return fc_in;
+        // auto &fc_out = fc->forward(fc_in);
+        // return view(fc_out, -1);
+    }
+};
+
+void lt_main(int party, std::string ip) // runs one party of the zero-weight, zero-input benchmark against peer `ip`
+{
+    sytorch_init();
+
+    const u64 n_vocab = 32000;
+    const u64 n_embd = 5120;
+    const u64 n_head = 40; // 40;
+    const u64 n_layer = 40; // 40;
+    const u64 intermediate_size = 13824;
+    const u64 scale = 12; // passed to init(); presumably the fixed-point fractional bit count - TODO confirm
+
+    using LlamaVersion = LlamaTransformer;
+    LlamaVersion *llama = new LlamaVersion();
+    LlamaConfig::bitlength = 48;
+    LlamaConfig::party = party;
+    llama->init(ip, true);
+
+    LlamaNextWordLogits llama_model(n_layer, n_head, n_embd, n_vocab, intermediate_size);
+    u64 n_seq = 128; // get_n_seq(fname, n_embd);
+    Tensor input({n_seq, n_embd});
+    input.zero(); // dummy all-zero input
+    llama_model.init(scale, input);
+    llama_model.setBackend(llama);
+    llama_model.optimize();
+    llama_model.zero(); // dummy all-zero weights; no weight file is loaded
+
+    // std::string fname = std::string("lambada-meta-llama2-7b/") + /*std::to_string(i)*/ +"999.dat";
+    llama->initializeInferencePartyA(llama_model.root);
+    llama->initializeInferencePartyB(input);
+
+    llama::start(); // start()/end() bracket only the forward pass
+    llama_model.forward(input); // result is read back through llama_model.activation below
+    llama::end();
+
+    auto &output = llama_model.activation;
+    llama->outputA(output);
+    llama->finalize();
+}
+
+int main(int __argc, char **__argv) // usage: <party> [ip]
+{
+    always_assert(__argc > 1); // the party id in __argv[1] is mandatory; never hand atoi a null pointer
+    int party = atoi(__argv[1]);
+    std::string ip = "0.0.0.0"; // NOTE(review): the GPT-Neo benchmark defaults to 127.0.0.1 - confirm which is intended
+    if (__argc > 2) ip = __argv[2];
+    lt_main(party, ip);
+    return 0;
+}
\ No newline at end of file
diff --git a/GPU-MPC/ext/sytorch/benchmarks/llama-7b.cpp b/GPU-MPC/ext/sytorch/benchmarks/llama-7b.cpp
new file mode 100644
index 00000000..595dd5b6
--- /dev/null
+++ b/GPU-MPC/ext/sytorch/benchmarks/llama-7b.cpp
@@ -0,0 +1,266 @@
+// Authors: Kanav Gupta, Neha Jawalkar
+// Copyright:
+//
+// Copyright (c) 2024 Microsoft Research
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+template
+class FFN : public SytorchModule // gated feed-forward block: down(silu(up1(x)) * up2(x))
+{
+    using SytorchModule::silu;
+    using SytorchModule::mul;
+
+    u64 in;
+    u64 intermediate_size;
+
+public:
+    FC *up1;  // in -> intermediate_size, goes through silu
+    FC *up2;  // in -> intermediate_size, multiplied with silu(up1(x))
+    FC *down; // intermediate_size -> in
+
+    FFN(u64 in, u64 intermediate_size) : in(in), intermediate_size(intermediate_size)
+    {
+        up1 = new FC(in, intermediate_size, false); // third FC argument presumably disables a bias term - TODO confirm
+        up2 = new FC(in, intermediate_size, false);
+        down = new FC(intermediate_size, in, false);
+    }
+
+    Tensor &_forward(Tensor &input) // returns down(mul(silu(up1(input)), up2(input)))
+    {
+        auto &a = up1->forward(input);
+        auto &b = up2->forward(input);
+        return down->forward(mul(silu(a), b));
+    }
+};
+
+template
+class MultiHeadAttention : public SytorchModule // self-attention with rotary_embedding applied to q and k of every head
+{
+    using SytorchModule::split;
+    using SytorchModule::view;
+    using SytorchModule::transpose;
+    using SytorchModule::matmul;
+    using SytorchModule::matmul_triangular;
+    using SytorchModule::scalarmul;
+    using SytorchModule::softmax_triangular;
+    using SytorchModule::concat;
+
+    using SytorchModule::mul;
+    using SytorchModule::add;
+    using SytorchModule::silu;
+    using SytorchModule::rotary_embedding;
+
+public:
+    FC *q_attn; // n_embd -> n_embd query projection
+    FC *k_attn; // n_embd -> n_embd key projection
+    FC *v_attn; // n_embd -> n_embd value projection
+    FC *c_proj; // n_embd -> n_embd projection applied to the concatenated heads
+
+    u64 n_heads;
+    u64 n_embd;
+
+    MultiHeadAttention(u64 n_heads, u64 n_embd) : n_heads(n_heads), n_embd(n_embd)
+    {
+        always_assert(n_embd % n_heads == 0); // n_embd has to divide evenly across the heads
+        q_attn = new FC(n_embd, n_embd, false);
+        k_attn = new FC(n_embd, n_embd, false);
+        v_attn = new FC(n_embd, n_embd, false);
+        c_proj = new FC(n_embd, n_embd, false);
+    }
+
+    Tensor &_forward(Tensor &input) // returns c_proj(concat over heads of softmax_triangular(scaled q1 * k1^T) * v)
+    {
+        auto &q_heads = q_attn->forward(input);
+        auto &k_heads = k_attn->forward(input);
+        auto &v_heads = v_attn->forward(input);
+        auto &qs = split(q_heads, n_heads); // one slice per head, addressed through view(..., i) below
+        auto &ks = split(k_heads, n_heads);
+        auto &vs = split(v_heads, n_heads);
+
+        double divisor = 1 / sqrt(double(n_embd) / double(n_heads)); // 1 / sqrt(per-head width)
+
+        std::vector *> qks_sm_vs; // per-head outputs, concatenated after the loop
+        for (u64 i = 0; i < n_heads; ++i)
+        {
+            auto &q = view(qs, i);
+            auto &k = view(ks, i);
+            auto &v = view(vs, i);
+
+            auto &q1 = rotary_embedding(q); // only q and k are rotated; v is used as is
+            auto &k1 = rotary_embedding(k);
+
+            auto &kt = transpose(k1);
+            auto &qk = matmul_triangular(q1, kt);
+            auto &qks = scalarmul(qk, divisor);
+
+            auto &qks_sm = softmax_triangular(qks);
+
+            auto &qks_sm_v = matmul(qks_sm, v);
+            qks_sm_vs.push_back(&qks_sm_v);
+        }
+
+        auto &qks_sm_vs_cat = concat(qks_sm_vs);
+        auto &res = c_proj->forward(qks_sm_vs_cat);
+        return res;
+    }
+};
+
+template
+class TransformerBlock : public SytorchModule // pre-norm residual block: y = x + attn(ln0(x)); out = y + ffn(ln1(y))
+{
+    using SytorchModule::add;
+
+    MultiHeadAttention *attn;
+    FFN *ffn;
+    RMSNorm *ln0; // applied before attention
+    RMSNorm *ln1; // applied before the feed-forward block
+
+    u64 n_heads, n_embd, intermediate_size;
+
+public:
+    TransformerBlock(u64 n_heads, u64 n_embd, u64 intermediate_size) : n_heads(n_heads), n_embd(n_embd), intermediate_size(intermediate_size)
+    {
+        attn = new MultiHeadAttention(n_heads, n_embd);
+        ffn = new FFN(n_embd, intermediate_size);
+        ln0 = new RMSNorm(n_embd, false); // NOTE(review): meaning of the second RMSNorm argument is not visible here
+        ln1 = new RMSNorm(n_embd, false);
+    }
+
+    Tensor &_forward(Tensor &input) // returns ffn(ln1(y)) + y with y = attn(ln0(input)) + input
+    {
+        auto &ln0_out = ln0->forward(input);
+        auto &attn_out = attn->forward(ln0_out);
+        auto &attn_out_add = add(attn_out, input); // first residual connection
+        auto &ln1_out = ln1->forward(attn_out_add);
+        auto &ffn_out = ffn->forward(ln1_out);
+        auto &ffn_out_add = add(ffn_out, attn_out_add); // second residual connection
+        return ffn_out_add;
+    }
+};
+
+template
+class LLAMA_MODEL : public SytorchModule // stack of n_layer TransformerBlocks; ln_f is built but not applied
+{
+    std::vector *> blocks;
+    RMSNorm *ln_f; // constructed below, but the forward call that would use it is commented out
+    u64 n_layer, n_heads, n_embd, intermediate_size;
+
+public:
+    LLAMA_MODEL(u64 n_layer, u64 n_heads, u64 n_embd, u64 intermediate_size) : n_layer(n_layer), n_heads(n_heads), n_embd(n_embd), intermediate_size(intermediate_size)
+    {
+        for (u64 i = 0; i < n_layer; ++i)
+        {
+            blocks.push_back(new TransformerBlock(n_heads, n_embd, intermediate_size));
+        }
+        ln_f = new RMSNorm(n_embd);
+    }
+
+    Tensor &_forward(Tensor &input) // feeds input through every block in order and returns the last block's output
+    {
+        Tensor *x = &input;
+        for (u64 i = 0; i < n_layer; ++i)
+        {
+            auto &block = blocks[i];
+            auto &x_out = block->forward(*x); // each block consumes the previous block's output
+            x = &x_out;
+        }
+        return *x;
+        // return ln_f->forward(*x);
+    }
+};
+
+template
+class LlamaNextWordLogits : public SytorchModule // wraps LLAMA_MODEL; the logits projection (fc) is currently bypassed
+{
+    using SytorchModule::view;
+    using SytorchModule::transpose;
+    LLAMA_MODEL *llama_model;
+    FC *fc; // n_embd -> n_vocab; constructed, but its forward call is commented out
+    u64 n_layer, n_heads, n_embd, n_vocab, intermediate_size;
+
+public:
+    LlamaNextWordLogits(u64 n_layer, u64 n_heads, u64 n_embd, u64 n_vocab, u64 intermediate_size) : n_layer(n_layer), n_heads(n_heads), n_embd(n_embd), n_vocab(n_vocab), intermediate_size(intermediate_size)
+    {
+        llama_model = new LLAMA_MODEL(n_layer, n_heads, n_embd, intermediate_size);
+        fc = new FC(n_embd, n_vocab, false);
+    }
+
+    Tensor &_forward(Tensor &input) // returns the transformer stack's output unchanged
+    {
+        auto &fc_in = llama_model->forward(input);
+        return fc_in;
+        // auto &fc_out = fc->forward(fc_in);
+        // return view(fc_out, -1);
+    }
+};
+
+void lt_main(int party, std::string ip) // runs one party of the zero-weight, zero-input benchmark against peer `ip`
+{
+    sytorch_init();
+
+    const u64 n_vocab = 32000;
+    const u64 n_embd = 4096;
+    const u64 n_head = 32;
+    const u64 n_layer = 32;//32;
+    const u64 intermediate_size = 11008;
+    const u64 scale = 12; // passed to init(); presumably the fixed-point fractional bit count - TODO confirm
+
+    using LlamaVersion = LlamaTransformer;
+    LlamaVersion *llama = new LlamaVersion();
+    LlamaConfig::bitlength = 48;
+    LlamaConfig::party = party;
+    llama->init(ip, true);
+
+    LlamaNextWordLogits llama_model(n_layer, n_head, n_embd, n_vocab, intermediate_size);
+    u64 n_seq = 128; // get_n_seq(fname, n_embd);
+    Tensor input({n_seq, n_embd});
+    input.zero(); // dummy all-zero input
+    llama_model.init(scale, input);
+    llama_model.setBackend(llama);
+    llama_model.optimize();
+    llama_model.zero(); // dummy all-zero weights; no weight file is loaded
+
+    // std::string fname = std::string("lambada-meta-llama2-7b/") + /*std::to_string(i)*/ +"999.dat";
+    llama->initializeInferencePartyA(llama_model.root);
+    llama->initializeInferencePartyB(input);
+
+    llama::start(); // start()/end() bracket only the forward pass
+    llama_model.forward(input); // result is read back through llama_model.activation below
+    llama::end();
+
+    auto &output = llama_model.activation;
+    llama->outputA(output);
+    llama->finalize();
+}
+
+int main(int __argc, char **__argv) // usage: <party> [ip]
+{
+    always_assert(__argc > 1); // the party id in __argv[1] is mandatory; never hand atoi a null pointer
+    int party = atoi(__argv[1]);
+    std::string ip = "0.0.0.0"; // NOTE(review): the GPT-Neo benchmark defaults to 127.0.0.1 - confirm which is intended
+    if (__argc > 2) ip = __argv[2];
+    lt_main(party, ip);
+    return 0;
+}
\ No newline at end of file
diff --git a/GPU-MPC/ext/sytorch/scripts/all-cpu-benchmarks-local.py b/GPU-MPC/ext/sytorch/scripts/all-cpu-benchmarks-local.py
new file mode 100644
index 00000000..49591859
--- /dev/null
+++ b/GPU-MPC/ext/sytorch/scripts/all-cpu-benchmarks-local.py
@@ -0,0 +1,85 @@
+import subprocess
+import csv
+
+mute = False
+
+benchmarks = [  # each entry is built and run as ./benchmark-<name>
+    'bert-tiny',
+    'bert-base',
+    'bert-large', # very large key
+    'gpt2',
+    'gptneo', # very large key
+    "llama-7b", # very large key
+    "llama-13b", # very large key
+]
+
+logfile1 = open("log1.log", 'a')  # output of make, the dealer run and online party 2
+logfile2 = open("log2.log", 'a')  # output of online party 3
+outcsv = open("results.csv", 'a')  # append mode: the header row below is written again on every invocation
+outcsv.write("model,act_time,act_comm,softmax_time,softmax_comm,norm_time,norm_comm,total_time,total_comm\n")
+outcsv.flush()
+
+def run_seq(cmd):  # run one shell command to completion, logging to log1.log; the exit status is not checked
+    p = subprocess.Popen(cmd, shell=True, stdout=logfile1, stderr=logfile1)
+    p.wait()
+
+
+def run_par(cmd1, cmd2):  # start both shell commands together and wait for both to finish
+    p1 = subprocess.Popen(cmd1, shell=True, stdout=logfile1, stderr=logfile1)
+    p2 = subprocess.Popen(cmd2, shell=True, stdout=logfile2, stderr=logfile2)
+    p1.wait()
+    p2.wait()
+
+for b in benchmarks:  # per benchmark: build, dealer run (party 1), then both online parties (2 and 3) side by side
+    print("[+] benchmarking " + b)
+    print("[+] --- compiling...")
+    run_seq('make benchmark-' + b)
+    print("[+] --- running dealer...")
+    run_seq(f'OMP_NUM_THREADS=4 ./benchmark-{b} 1')
+    print("[+] --- running online phase...")
+    run_par(f'OMP_NUM_THREADS=4 ./benchmark-{b} 2', f'OMP_NUM_THREADS=4 ./benchmark-{b} 3')
+
+    total_time = 0.0
+    total_comm = 0.0
+    act_time = 0.0
+    act_comm = 0.0
+    softmax_time = 0.0
+    softmax_comm = 0.0
+    norm_time = 0.0
+    norm_comm = 0.0
+    with open('llama3.csv') as f:  # per-layer log; presumably written by party 3 of the run above - confirm
+        csvFile = csv.reader(f)
+        header_skipped = False
+        for lines in csvFile:  # column 0 = layer name, 1 = time, 2 = communication
+            if not header_skipped:
+                header_skipped = True
+                continue
+            if lines[0].startswith('GeLU::'):  # NOTE(review): llama-* benchmarks use silu and RMSNorm; confirm they are meant to be left out of act/norm
+                act_time += float(lines[1])
+                act_comm += float(lines[2])
+            elif lines[0].startswith('LayerNorm::'):
+                norm_time += float(lines[1])
+                norm_comm += float(lines[2])
+            elif lines[0].startswith('nExp::'):  # nExp:: and Softmax:: rows both count towards softmax
+                softmax_time += float(lines[1])
+                softmax_comm += float(lines[2])
+            elif lines[0].startswith('Softmax::'):
+                softmax_time += float(lines[1])
+                softmax_comm += float(lines[2])
+            total_time += float(lines[1])  # totals include every row, matched or not
+            total_comm += float(lines[2])
+    run_seq(f"cp llama3.csv {b}.csv")  # keep a per-benchmark copy before the next run overwrites it
+    print("[+] --- act time = " + str(act_time/1000.0) + " s")  # presumably ms and MB in the log, given the /1000 -> s and /1024 -> GB conversions
+    print("[+] --- act comm = " + str(act_comm/1024.0) + " GB")
+    print("[+] --- softmax time = " + str(softmax_time/1000.0) + " s")
+    print("[+] --- softmax comm = " + str(softmax_comm/1024.0) + " GB")
+    print("[+] --- norm time = " + str(norm_time/1000.0) + " s")
+    print("[+] --- norm comm = " + str(norm_comm/1024.0) + " GB")
+    print("[+] --- online time = " + str(total_time/1000.0) + " s")
+    print("[+] --- online comm = " + str(total_comm/1024.0) + " GB")
+    outcsv.write(f"{b},{act_time},{act_comm},{softmax_time},{softmax_comm},{norm_time},{norm_comm},{total_time},{total_comm}\n")  # raw (unconverted) sums
+    outcsv.flush()
+
+logfile1.close()
+logfile2.close()
+outcsv.close()
diff --git a/GPU-MPC/ext/sytorch/scripts/all-cpu-benchmarks-remote.py b/GPU-MPC/ext/sytorch/scripts/all-cpu-benchmarks-remote.py
new file mode 100644
index 00000000..57f12366
--- /dev/null
+++ b/GPU-MPC/ext/sytorch/scripts/all-cpu-benchmarks-remote.py
@@ -0,0 +1,88 @@
+# Runs this machine's online party of every CPU benchmark against a remote peer and sums its per-layer log.
+import subprocess
+import csv
+import sys
+
+mute = False
+
+if len(sys.argv) < 3:
+    print("missing arguments")
+    print(f"usage: python {sys.argv[0]} <ip> <party>")
+    sys.exit(1)  # a usage error must not report success
+
+ip = sys.argv[1]  # passed to the benchmark binary as its second argument
+party = int(sys.argv[2])  # the benchmark binary is started as party + 2
+
+benchmarks = [  # each entry is built and run as ./benchmark-<name>
+    'bert-tiny',
+    'bert-base',
+    'bert-large', # very large key
+    'gpt2',
+    'gptneo', # very large key
+    "llama-7b", # very large key
+    "llama-13b", # very large key
+]
+
+logfile1 = open("log1.log", 'a')  # the only log file this script opens
+outcsv = open("results.csv", 'a')  # append mode: the header row below is written again on every invocation
+outcsv.write("model,act_time,act_comm,softmax_time,softmax_comm,norm_time,norm_comm,total_time,total_comm\n")
+outcsv.flush()
+
+def run_seq(cmd):  # run one shell command to completion, logging to log1.log; the exit status is not checked
+    p = subprocess.Popen(cmd, shell=True, stdout=logfile1, stderr=logfile1)
+    p.wait()
+
+
+for b in benchmarks:  # per benchmark: build, local dealer run (party 1), then this machine's online party
+    print("[+] benchmarking " + b)
+    print("[+] --- compiling...")
+    run_seq('make benchmark-' + b)
+    print("[+] --- running dealer...")
+    run_seq(f'OMP_NUM_THREADS=4 ./benchmark-{b} 1')
+    print("[+] --- running online phase...")
+    # run_par(f'OMP_NUM_THREADS=4 ./benchmark-{b} 2', f'OMP_NUM_THREADS=4 ./benchmark-{b} 3')
+    run_seq(f"OMP_NUM_THREADS=4 ./benchmark-{b} {party+2} {ip}")
+
+    total_time = 0.0
+    total_comm = 0.0
+    act_time = 0.0
+    act_comm = 0.0
+    softmax_time = 0.0
+    softmax_comm = 0.0
+    norm_time = 0.0
+    norm_comm = 0.0
+    with open(f'llama{party+2}.csv') as f:  # per-layer log; presumably written by the party started above - confirm
+        csvFile = csv.reader(f)
+        header_skipped = False
+        for lines in csvFile:  # column 0 = layer name, 1 = time, 2 = communication
+            if not header_skipped:
+                header_skipped = True
+                continue
+            if lines[0].startswith('GeLU::'):  # NOTE(review): llama-* benchmarks use silu and RMSNorm; confirm they are meant to be left out of act/norm
+                act_time += float(lines[1])
+                act_comm += float(lines[2])
+            elif lines[0].startswith('LayerNorm::'):
+                norm_time += float(lines[1])
+                norm_comm += float(lines[2])
+            elif lines[0].startswith('nExp::'):  # nExp:: and Softmax:: rows both count towards softmax
+                softmax_time += float(lines[1])
+                softmax_comm += float(lines[2])
+            elif lines[0].startswith('Softmax::'):
+                softmax_time += float(lines[1])
+                softmax_comm += float(lines[2])
+            total_time += float(lines[1])  # totals include every row, matched or not
+            total_comm += float(lines[2])
+    run_seq(f"cp llama{party+2}.csv remote-{b}.csv")  # keep a per-benchmark copy before the next run overwrites it
+    print("[+] --- act time = " + str(act_time/1000.0) + " s")  # presumably ms and MB in the log, given the /1000 -> s and /1024 -> GB conversions
+    print("[+] --- act comm = " + str(act_comm/1024.0) + " GB")
+    print("[+] --- softmax time = " + str(softmax_time/1000.0) + " s")
+    print("[+] --- softmax comm = " + str(softmax_comm/1024.0) + " GB")
+    print("[+] --- norm time = " + str(norm_time/1000.0) + " s")
+    print("[+] --- norm comm = " + str(norm_comm/1024.0) + " GB")
+    print("[+] --- online time = " + str(total_time/1000.0) + " s")
+    print("[+] --- online comm = " + str(total_comm/1024.0) + " GB")
+    outcsv.write(f"remote-{b},{act_time},{act_comm},{softmax_time},{softmax_comm},{norm_time},{norm_comm},{total_time},{total_comm}\n")  # raw (unconverted) sums
+    outcsv.flush()
+
+logfile1.close()
+outcsv.close()