diff --git a/GPU-MPC/ext/sytorch/CMakeLists.txt b/GPU-MPC/ext/sytorch/CMakeLists.txt index 544e70a8..9d871fad 100755 --- a/GPU-MPC/ext/sytorch/CMakeLists.txt +++ b/GPU-MPC/ext/sytorch/CMakeLists.txt @@ -370,3 +370,52 @@ add_executable( ) target_link_libraries(gptneobenchmark ${PROJECT_NAME}) + +add_executable( + benchmark-bert-tiny + benchmarks/bert-tiny.cpp +) + +target_link_libraries(benchmark-bert-tiny ${PROJECT_NAME}) + +add_executable( + benchmark-bert-base + benchmarks/bert-base.cpp +) + +target_link_libraries(benchmark-bert-base ${PROJECT_NAME}) + +add_executable( + benchmark-bert-large + benchmarks/bert-large.cpp +) + +target_link_libraries(benchmark-bert-large ${PROJECT_NAME}) + +add_executable( + benchmark-gpt2 + benchmarks/gpt2.cpp +) + +target_link_libraries(benchmark-gpt2 ${PROJECT_NAME}) + +add_executable( + benchmark-gptneo + benchmarks/gptneo.cpp +) + +target_link_libraries(benchmark-gptneo ${PROJECT_NAME}) + +add_executable( + benchmark-llama-7b + benchmarks/llama-7b.cpp +) + +target_link_libraries(benchmark-llama-7b ${PROJECT_NAME}) + +add_executable( + benchmark-llama-13b + benchmarks/llama-13b.cpp +) + +target_link_libraries(benchmark-llama-13b ${PROJECT_NAME}) diff --git a/GPU-MPC/ext/sytorch/benchmarks/bert-base.cpp b/GPU-MPC/ext/sytorch/benchmarks/bert-base.cpp new file mode 100644 index 00000000..078f0ab0 --- /dev/null +++ b/GPU-MPC/ext/sytorch/benchmarks/bert-base.cpp @@ -0,0 +1,256 @@ +// Authors: Kanav Gupta, Neha Jawalkar +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// 
The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +template +class FFN : public SytorchModule +{ +public: + using SytorchModule::gelu; + + u64 in; + u64 hidden; + +public: + FC *up; + FC *down; + + FFN(u64 in, u64 hidden) : in(in), hidden(hidden) + { + up = new FC(in, hidden, true); + down = new FC(hidden, in, true); + } + + Tensor &_forward(Tensor &input) + { + return down->forward(gelu(up->forward(input))); + } +}; + +template +class MultiHeadAttention : public SytorchModule +{ +public: + using SytorchModule::split; + using SytorchModule::view; + using SytorchModule::add; + using SytorchModule::transpose; + using SytorchModule::matmul; + using SytorchModule::scalarmul; + using SytorchModule::invsqrt; + using SytorchModule::softmax; + using SytorchModule::concat; + using SytorchModule::attention_mask; + +public: + FC *c_attn; + FC *c_proj; + + u64 n_heads; + u64 n_embd; + + MultiHeadAttention(u64 n_heads, u64 n_embd) : n_heads(n_heads), n_embd(n_embd) + { + always_assert(n_embd % n_heads == 0); + c_attn = new FC(n_embd, 3 * n_embd, true); + c_proj = new FC(n_embd, n_embd, true); + } + + Tensor &_forward(Tensor &input) + { + auto &x = c_attn->forward(input); + auto &qkv_heads = split(x, 3); + auto &q_heads = view(qkv_heads, 0); + auto &k_heads = view(qkv_heads, 1); + auto &v_heads = view(qkv_heads, 2); + auto &qs 
= split(q_heads, n_heads); + auto &ks = split(k_heads, n_heads); + auto &vs = split(v_heads, n_heads); + + double divisor = 1 / sqrt(double(n_embd) / double(n_heads)); + + std::vector *> qks_sm_vs; + for (u64 i = 0; i < n_heads; ++i) + { + auto &q = view(qs, i); + auto &k = view(ks, i); + auto &v = view(vs, i); + auto &kt = transpose(k); + auto &qk = matmul(q, kt); + auto &qks = scalarmul(qk, divisor); + + auto &qks_sm = softmax(qks); + + auto &qks_sm_v = matmul(qks_sm, v); + qks_sm_vs.push_back(&qks_sm_v); + } + + auto &qks_sm_vs_cat = concat(qks_sm_vs); + auto &res = c_proj->forward(qks_sm_vs_cat); + return res; + } +}; + +template +class TransformerBlock : public SytorchModule +{ +public: + using SytorchModule::add; + + MultiHeadAttention *attn; + FFN *ffn; + LayerNorm *ln0; + LayerNorm *ln1; + + u64 n_heads, n_embd; + +public: + TransformerBlock(u64 n_heads, u64 n_embd) : n_heads(n_heads), n_embd(n_embd) + { + attn = new MultiHeadAttention(n_heads, n_embd); + ffn = new FFN(n_embd, 4 * n_embd); + ln0 = new LayerNorm(n_embd); + ln1 = new LayerNorm(n_embd); + } + + Tensor &_forward(Tensor &input) + { + auto &attn_out = attn->forward(input); + auto &add0_out = add(attn_out, input); + auto &ln0_out = ln0->forward(add0_out); + + auto &ffn_out = ffn->forward(ln0_out); + auto &add1_out = add(ffn_out, ln0_out); + auto &ln1_out = ln1->forward(add1_out); + return ln1_out; + } +}; + +template +class BERT : public SytorchModule +{ +public: + using SytorchModule::tanh; + using SytorchModule::view; + using SytorchModule::add; + using SytorchModule::unsqueeze; + std::vector *> blocks; + LayerNorm *ln_f; + FC *pool; + u64 n_layer, n_heads, n_embd; + +public: + BERT(u64 n_layer, u64 n_heads, u64 n_embd) : n_layer(n_layer), n_heads(n_heads), n_embd(n_embd) + { + for (u64 i = 0; i < n_layer; ++i) + { + blocks.push_back(new TransformerBlock(n_heads, n_embd)); + } + ln_f = new LayerNorm(n_embd); + pool = new FC(n_embd, n_embd, true); + } + + Tensor &_forward(Tensor &input) + { + 
auto &y = ln_f->forward(input); + Tensor *x = &y; + // Tensor *x = &input; + + for (u64 i = 0; i < n_layer; ++i) + { + auto &block = blocks[i]; + auto &x_out = block->forward(*x); + x = &x_out; + } + + return *x; + } +}; + +int main(int __argc, char **__argv) +{ + sytorch_init(); + + // bert base + const u64 n_embd = 768; + const u64 n_head = 12; + const u64 n_layer = 12; + const u64 scale = 12; + const u64 bw = 51; + const u64 n_seq = 128; + + int party = atoi(__argv[1]); + std::string ip = "127.0.0.1"; + if (__argc > 2) + ip = __argv[2]; + + using LlamaVersion = LlamaTransformer; + LlamaVersion *llama = new LlamaVersion(); + srand(time(NULL)); + + LlamaConfig::bitlength = bw; + LlamaConfig::party = party; + + llama->init(ip, true); + + BERT net(n_layer, n_head, n_embd); + Tensor input({n_seq, n_embd}); + net.init(scale, input); + net.setBackend(llama); + net.optimize(); + if (party != DEALER) + { + // net.load("bert-tiny-weights.dat"); + // input.load("15469.dat", scale); + input.fill(1LL << (scale - 2)); + } + else + { + net.zero(); + } + + llama->initializeInferencePartyA(net.root); + llama->initializeInferencePartyB(input); + + llama::start(); + net.forward(input); + llama::end(); + + auto &output = net.activation; + llama->outputA(output); + llama->finalize(); + + if (party == CLIENT) + { + auto signedAct = Tensor((i64*) net.activation.data, net.activation.shape); + print(signedAct, scale, bw); + } + return 0; +} \ No newline at end of file diff --git a/GPU-MPC/ext/sytorch/benchmarks/bert-large.cpp b/GPU-MPC/ext/sytorch/benchmarks/bert-large.cpp new file mode 100644 index 00000000..f345d34a --- /dev/null +++ b/GPU-MPC/ext/sytorch/benchmarks/bert-large.cpp @@ -0,0 +1,256 @@ +// Authors: Kanav Gupta, Neha Jawalkar +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software 
without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +template +class FFN : public SytorchModule +{ +public: + using SytorchModule::gelu; + + u64 in; + u64 hidden; + +public: + FC *up; + FC *down; + + FFN(u64 in, u64 hidden) : in(in), hidden(hidden) + { + up = new FC(in, hidden, true); + down = new FC(hidden, in, true); + } + + Tensor &_forward(Tensor &input) + { + return down->forward(gelu(up->forward(input))); + } +}; + +template +class MultiHeadAttention : public SytorchModule +{ +public: + using SytorchModule::split; + using SytorchModule::view; + using SytorchModule::add; + using SytorchModule::transpose; + using SytorchModule::matmul; + using SytorchModule::scalarmul; + using SytorchModule::invsqrt; + using SytorchModule::softmax; + using SytorchModule::concat; + using SytorchModule::attention_mask; + +public: + FC *c_attn; + FC *c_proj; + + u64 n_heads; + u64 n_embd; + + MultiHeadAttention(u64 n_heads, u64 n_embd) : n_heads(n_heads), n_embd(n_embd) + { + always_assert(n_embd % n_heads == 0); + c_attn = new FC(n_embd, 3 * n_embd, true); + 
c_proj = new FC(n_embd, n_embd, true); + } + + Tensor &_forward(Tensor &input) + { + auto &x = c_attn->forward(input); + auto &qkv_heads = split(x, 3); + auto &q_heads = view(qkv_heads, 0); + auto &k_heads = view(qkv_heads, 1); + auto &v_heads = view(qkv_heads, 2); + auto &qs = split(q_heads, n_heads); + auto &ks = split(k_heads, n_heads); + auto &vs = split(v_heads, n_heads); + + double divisor = 1 / sqrt(double(n_embd) / double(n_heads)); + + std::vector *> qks_sm_vs; + for (u64 i = 0; i < n_heads; ++i) + { + auto &q = view(qs, i); + auto &k = view(ks, i); + auto &v = view(vs, i); + auto &kt = transpose(k); + auto &qk = matmul(q, kt); + auto &qks = scalarmul(qk, divisor); + + auto &qks_sm = softmax(qks); + + auto &qks_sm_v = matmul(qks_sm, v); + qks_sm_vs.push_back(&qks_sm_v); + } + + auto &qks_sm_vs_cat = concat(qks_sm_vs); + auto &res = c_proj->forward(qks_sm_vs_cat); + return res; + } +}; + +template +class TransformerBlock : public SytorchModule +{ +public: + using SytorchModule::add; + + MultiHeadAttention *attn; + FFN *ffn; + LayerNorm *ln0; + LayerNorm *ln1; + + u64 n_heads, n_embd; + +public: + TransformerBlock(u64 n_heads, u64 n_embd) : n_heads(n_heads), n_embd(n_embd) + { + attn = new MultiHeadAttention(n_heads, n_embd); + ffn = new FFN(n_embd, 4 * n_embd); + ln0 = new LayerNorm(n_embd); + ln1 = new LayerNorm(n_embd); + } + + Tensor &_forward(Tensor &input) + { + auto &attn_out = attn->forward(input); + auto &add0_out = add(attn_out, input); + auto &ln0_out = ln0->forward(add0_out); + + auto &ffn_out = ffn->forward(ln0_out); + auto &add1_out = add(ffn_out, ln0_out); + auto &ln1_out = ln1->forward(add1_out); + return ln1_out; + } +}; + +template +class BERT : public SytorchModule +{ +public: + using SytorchModule::tanh; + using SytorchModule::view; + using SytorchModule::add; + using SytorchModule::unsqueeze; + std::vector *> blocks; + LayerNorm *ln_f; + FC *pool; + u64 n_layer, n_heads, n_embd; + +public: + BERT(u64 n_layer, u64 n_heads, u64 n_embd) : 
n_layer(n_layer), n_heads(n_heads), n_embd(n_embd) + { + for (u64 i = 0; i < n_layer; ++i) + { + blocks.push_back(new TransformerBlock(n_heads, n_embd)); + } + ln_f = new LayerNorm(n_embd); + pool = new FC(n_embd, n_embd, true); + } + + Tensor &_forward(Tensor &input) + { + auto &y = ln_f->forward(input); + Tensor *x = &y; + // Tensor *x = &input; + + for (u64 i = 0; i < n_layer; ++i) + { + auto &block = blocks[i]; + auto &x_out = block->forward(*x); + x = &x_out; + } + + return *x; + } +}; + +int main(int __argc, char **__argv) +{ + sytorch_init(); + + // bert large + const u64 n_embd = 1024; + const u64 n_head = n_embd / 64; + const u64 n_layer = 24; + const u64 scale = 12; + const u64 bw = 51; + const u64 n_seq = 128; + + int party = atoi(__argv[1]); + std::string ip = "127.0.0.1"; + if (__argc > 2) + ip = __argv[2]; + + using LlamaVersion = LlamaTransformer; + LlamaVersion *llama = new LlamaVersion(); + srand(time(NULL)); + + LlamaConfig::bitlength = bw; + LlamaConfig::party = party; + + llama->init(ip, true); + + BERT net(n_layer, n_head, n_embd); + Tensor input({n_seq, n_embd}); + net.init(scale, input); + net.setBackend(llama); + net.optimize(); + if (party != DEALER) + { + // net.load("bert-tiny-weights.dat"); + // input.load("15469.dat", scale); + input.fill(1LL << (scale - 2)); + } + else + { + net.zero(); + } + + llama->initializeInferencePartyA(net.root); + llama->initializeInferencePartyB(input); + + llama::start(); + net.forward(input); + llama::end(); + + auto &output = net.activation; + llama->outputA(output); + llama->finalize(); + + if (party == CLIENT) + { + auto signedAct = Tensor((i64*) net.activation.data, net.activation.shape); + print(signedAct, scale, bw); + } + return 0; +} \ No newline at end of file diff --git a/GPU-MPC/ext/sytorch/benchmarks/bert-tiny.cpp b/GPU-MPC/ext/sytorch/benchmarks/bert-tiny.cpp new file mode 100644 index 00000000..4f428c8e --- /dev/null +++ b/GPU-MPC/ext/sytorch/benchmarks/bert-tiny.cpp @@ -0,0 +1,250 @@ +// 
Authors: Kanav Gupta, Neha Jawalkar +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +template +class FFN : public SytorchModule +{ +public: + using SytorchModule::gelu; + + u64 in; + u64 hidden; + +public: + FC *up; + FC *down; + + FFN(u64 in, u64 hidden) : in(in), hidden(hidden) + { + up = new FC(in, hidden, true); + down = new FC(hidden, in, true); + } + + Tensor &_forward(Tensor &input) + { + return down->forward(gelu(up->forward(input))); + } +}; + +template +class MultiHeadAttention : public SytorchModule +{ +public: + using SytorchModule::split; + using SytorchModule::view; + using SytorchModule::add; + using SytorchModule::transpose; + using SytorchModule::matmul; + using SytorchModule::scalarmul; + using SytorchModule::invsqrt; + using SytorchModule::softmax; + using SytorchModule::concat; + using SytorchModule::attention_mask; + +public: + FC *c_attn; + FC *c_proj; + + u64 n_heads; + u64 n_embd; + + MultiHeadAttention(u64 n_heads, u64 n_embd) : n_heads(n_heads), n_embd(n_embd) + { + always_assert(n_embd % n_heads == 0); + c_attn = new FC(n_embd, 3 * n_embd, true); + c_proj = new FC(n_embd, n_embd, true); + } + + Tensor &_forward(Tensor &input) + { + auto &x = c_attn->forward(input); + auto &qkv_heads = split(x, 3); + auto &q_heads = view(qkv_heads, 0); + auto &k_heads = view(qkv_heads, 1); + auto &v_heads = view(qkv_heads, 2); + auto &qs = split(q_heads, n_heads); + auto &ks = split(k_heads, n_heads); + auto &vs = split(v_heads, n_heads); + + double divisor = 1 / sqrt(double(n_embd) / double(n_heads)); + + std::vector *> qks_sm_vs; + for (u64 i = 0; i < n_heads; ++i) + { + auto &q = view(qs, i); + auto &k = view(ks, i); + auto &v = view(vs, i); + auto &kt = transpose(k); + auto &qk = matmul(q, kt); + auto &qks = scalarmul(qk, divisor); + + auto &qks_sm = softmax(qks); + + auto &qks_sm_v = matmul(qks_sm, v); + qks_sm_vs.push_back(&qks_sm_v); + } + + auto &qks_sm_vs_cat = concat(qks_sm_vs); + auto &res = c_proj->forward(qks_sm_vs_cat); + return 
res; + } +}; + +template +class TransformerBlock : public SytorchModule +{ +public: + using SytorchModule::add; + + MultiHeadAttention *attn; + FFN *ffn; + LayerNorm *ln0; + LayerNorm *ln1; + + u64 n_heads, n_embd; + +public: + TransformerBlock(u64 n_heads, u64 n_embd) : n_heads(n_heads), n_embd(n_embd) + { + attn = new MultiHeadAttention(n_heads, n_embd); + ffn = new FFN(n_embd, 4 * n_embd); + ln0 = new LayerNorm(n_embd); + ln1 = new LayerNorm(n_embd); + } + + Tensor &_forward(Tensor &input) + { + auto &attn_out = attn->forward(input); + auto &add0_out = add(attn_out, input); + auto &ln0_out = ln0->forward(add0_out); + + auto &ffn_out = ffn->forward(ln0_out); + auto &add1_out = add(ffn_out, ln0_out); + auto &ln1_out = ln1->forward(add1_out); + return ln1_out; + } +}; + +template +class BERT : public SytorchModule +{ +public: + using SytorchModule::tanh; + using SytorchModule::view; + using SytorchModule::add; + using SytorchModule::unsqueeze; + std::vector *> blocks; + LayerNorm *ln_f; + FC *pool; + u64 n_layer, n_heads, n_embd; + +public: + BERT(u64 n_layer, u64 n_heads, u64 n_embd) : n_layer(n_layer), n_heads(n_heads), n_embd(n_embd) + { + for (u64 i = 0; i < n_layer; ++i) + { + blocks.push_back(new TransformerBlock(n_heads, n_embd)); + } + ln_f = new LayerNorm(n_embd); + pool = new FC(n_embd, n_embd, true); + } + + Tensor &_forward(Tensor &input) + { + auto &y = ln_f->forward(input); + Tensor *x = &y; + // Tensor *x = &input; + + for (u64 i = 0; i < n_layer; ++i) + { + auto &block = blocks[i]; + auto &x_out = block->forward(*x); + x = &x_out; + } + + return *x; + } +}; + +int main(int __argc, char **__argv) +{ + sytorch_init(); + + // bert tiny + const u64 n_embd = 128; + const u64 n_head = n_embd / 64; + const u64 n_layer = 2; + const u64 scale = 12; + const u64 bw = 37; + const u64 n_seq = 128; + + int party = atoi(__argv[1]); + std::string ip = "127.0.0.1"; + if (__argc > 2) + ip = __argv[2]; + + using LlamaVersion = LlamaTransformer; + LlamaVersion *llama = 
new LlamaVersion(); + srand(time(NULL)); + + LlamaConfig::bitlength = bw; + LlamaConfig::party = party; + + llama->init(ip, true); + + BERT net(n_layer, n_head, n_embd); + Tensor input({n_seq, n_embd}); + net.init(scale, input); + net.setBackend(llama); + net.optimize(); + if (party != DEALER) + { + // net.load("bert-tiny-weights.dat"); + // input.load("15469.dat", scale); + input.fill(1LL << (scale - 2)); + } + else + { + net.zero(); + } + + llama->initializeInferencePartyA(net.root); + llama->initializeInferencePartyB(input); + + llama::start(); + net.forward(input); + llama::end(); + + auto &output = net.activation; + llama->outputA(output); + llama->finalize(); + return 0; +} \ No newline at end of file diff --git a/GPU-MPC/ext/sytorch/benchmarks/gpt2.cpp b/GPU-MPC/ext/sytorch/benchmarks/gpt2.cpp new file mode 100644 index 00000000..52efc339 --- /dev/null +++ b/GPU-MPC/ext/sytorch/benchmarks/gpt2.cpp @@ -0,0 +1,226 @@ +// Authors: Kanav Gupta, Neha Jawalkar +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include +#include +#include +#include +#include + +template +class FFN : public SytorchModule +{ + using SytorchModule::gelu; + + u64 in; + u64 hidden; +public: + FC *up; + FC *down; + + FFN(u64 in, u64 hidden) : in(in), hidden(hidden) + { + up = new FC(in, hidden, true); + down = new FC(hidden, in, true); + } + + Tensor &_forward(Tensor &input) + { + return down->forward(gelu(up->forward(input))); + } +}; + +template +class MultiHeadAttention : public SytorchModule +{ + using SytorchModule::split; + using SytorchModule::view; + using SytorchModule::transpose; + using SytorchModule::matmul; + using SytorchModule::matmul_triangular; + using SytorchModule::scalarmul; + using SytorchModule::softmax_triangular; + using SytorchModule::concat; + +public: + FC *c_attn; + FC *c_proj; + + u64 n_heads; + u64 n_embd; + + MultiHeadAttention(u64 n_heads, u64 n_embd): n_heads(n_heads), n_embd(n_embd) + { + always_assert(n_embd % n_heads == 0); + c_attn = new FC(n_embd, 3*n_embd, true); + c_proj = new FC(n_embd, n_embd, true); + } + + Tensor &_forward(Tensor &input) + { + auto &x = c_attn->forward(input); + auto &qkv_heads = split(x, 3); + auto &q_heads = view(qkv_heads, 0); + auto &k_heads = view(qkv_heads, 1); + auto &v_heads = view(qkv_heads, 2); + auto &qs = split(q_heads, n_heads); + auto &ks = split(k_heads, n_heads); + auto &vs = split(v_heads, n_heads); + + double divisor = 1 / sqrt(double(n_embd) / double(n_heads)); + + std::vector*> qks_sm_vs; + for(u64 i = 0; i < n_heads; ++i) + { + auto &q = view(qs, i); + auto &k = view(ks, i); + auto &v = view(vs, i); + auto &kt = transpose(k); + auto &qk = matmul_triangular(q, kt); + auto &qks = scalarmul(qk, divisor); + + auto &qks_sm = 
softmax_triangular(qks); + + auto &qks_sm_v = matmul(qks_sm, v); + qks_sm_vs.push_back(&qks_sm_v); + } + + auto &qks_sm_vs_cat = concat(qks_sm_vs); + auto &res = c_proj->forward(qks_sm_vs_cat); + return res; + } +}; + +template +class TransformerBlock : public SytorchModule +{ + using SytorchModule::add; + + MultiHeadAttention *attn; + FFN *ffn; + LayerNorm *ln0; + LayerNorm *ln1; + + u64 n_heads, n_embd; +public: + + TransformerBlock(u64 n_heads, u64 n_embd): n_heads(n_heads), n_embd(n_embd) + { + attn = new MultiHeadAttention(n_heads, n_embd); + ffn = new FFN(n_embd, 4*n_embd); + ln0 = new LayerNorm(n_embd); + ln1 = new LayerNorm(n_embd); + } + + Tensor &_forward(Tensor &input) + { + auto &ln0_out = ln0->forward(input); + auto &attn_out = attn->forward(ln0_out); + auto &attn_out_add = add(attn_out, input); + auto &ln1_out = ln1->forward(attn_out_add); + auto &ffn_out = ffn->forward(ln1_out); + auto &ffn_out_add = add(ffn_out, attn_out_add); + return ffn_out_add; + } +}; + +template +class GPT2 : public SytorchModule +{ + std::vector *> blocks; + LayerNorm *ln_f; + u64 n_layer, n_heads, n_embd; + +public: + + GPT2(u64 n_layer, u64 n_heads, u64 n_embd): n_layer(n_layer), n_heads(n_heads), n_embd(n_embd) + { + for(u64 i = 0; i < n_layer; ++i) + { + blocks.push_back(new TransformerBlock(n_heads, n_embd)); + } + ln_f = new LayerNorm(n_embd); + } + + Tensor &_forward(Tensor &input) + { + Tensor *x = &input; + + for(u64 i = 0; i < n_layer; ++i) + { + auto &block = blocks[i]; + auto &x_out = block->forward(*x); + x = &x_out; + } + // return ln_f->forward(*x); + return *x; + } +}; + +int main(int __argc, char**__argv) +{ + sytorch_init(); + + // gpt2 + const u64 n_embd = 768; + const u64 n_head = 12; + const u64 n_layer = 12; + const u64 bw = 51; + + const u64 scale = 12; + const u64 n_seq = 128; + + int party = atoi(__argv[1]); + std::string ip = "127.0.0.1"; + if (__argc > 2) + ip = __argv[2]; + + using LlamaVersion = LlamaTransformer; + LlamaVersion *llama = new 
LlamaVersion(); + srand(time(NULL)); + + LlamaConfig::bitlength = bw; + LlamaConfig::party = party; + + llama->init(ip, true); + + GPT2 net(n_layer, n_head, n_embd); + net.init(scale); + net.setBackend(llama); + net.optimize(); + llama->initializeInferencePartyA(net.root); + + Tensor input({n_seq, n_embd}); + if(party == CLIENT){ + input.fill(1LL << (scale-2)); + } + llama->initializeInferencePartyB(input); + + llama::start(); + net.forward(input); + llama::end(); + + auto &output = net.activation; + llama->outputA(output); + llama->finalize(); + + return 0; +} \ No newline at end of file diff --git a/GPU-MPC/ext/sytorch/benchmarks/gptneo.cpp b/GPU-MPC/ext/sytorch/benchmarks/gptneo.cpp new file mode 100644 index 00000000..c44f5dd6 --- /dev/null +++ b/GPU-MPC/ext/sytorch/benchmarks/gptneo.cpp @@ -0,0 +1,304 @@ +// Authors: Kanav Gupta, Neha Jawalkar +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include
+#include
+#include
+#include
+#include
+
+template <typename T>
+class FFN : public SytorchModule<T>
+{
+    using SytorchModule<T>::gelu;
+
+    u64 in;
+    u64 hidden;
+public:
+    FC<T> *up;
+    FC<T> *down;
+
+    FFN(u64 in, u64 hidden) : in(in), hidden(hidden)
+    {
+        up = new FC<T>(in, hidden, true);
+        down = new FC<T>(hidden, in, true);
+    }
+
+    Tensor<T> &_forward(Tensor<T> &input)
+    {
+        return down->forward(gelu(up->forward(input)));
+    }
+};
+
+template <typename T>
+class MultiHeadAttention : public SytorchModule<T>
+{
+    using SytorchModule<T>::split;
+    using SytorchModule<T>::view;
+    using SytorchModule<T>::add;
+    using SytorchModule<T>::transpose;
+    using SytorchModule<T>::matmul;
+    using SytorchModule<T>::scalarmul;
+    using SytorchModule<T>::invsqrt;
+    using SytorchModule<T>::softmax;
+    using SytorchModule<T>::concat;
+    using SytorchModule<T>::attention_mask;
+    // using SytorchModule<T>::local_attention_mask;
+    ///////////////////////////
+    using SytorchModule<T>::matmul_triangular;
+    using SytorchModule<T>::softmax_triangular;
+
+public:
+    // FC<T> *c_attn;
+    FC<T> *k_attn;
+    FC<T> *v_attn;
+    FC<T> *q_attn;
+    FC<T> *c_proj;
+
+    u64 n_heads;
+    u64 n_embd;
+    u64 attention_type;
+    u64 window_size;
+
+    MultiHeadAttention(u64 n_heads, u64 n_embd, u64 attention_type, u64 window_size): n_heads(n_heads), n_embd(n_embd), attention_type(attention_type), window_size(window_size)
+    {
+        always_assert(n_embd % n_heads == 0);
+        // c_attn = new FC<T>(n_embd, 3*n_embd, true);
+        k_attn = new FC<T>(n_embd, n_embd, false);
+        v_attn = new FC<T>(n_embd, n_embd, false);
+        q_attn = new FC<T>(n_embd, n_embd, false);
+        c_proj = new FC<T>(n_embd, n_embd, true);
+    }
+
+    Tensor<T> &_forward(Tensor<T> &input)
+    {
+        // auto &x = c_attn->forward(input);
+        // auto &qkv_heads = split(x, 3);
+        // auto &q_heads = view(qkv_heads, 0);
+        // auto &k_heads = view(qkv_heads, 1);
+        // auto &v_heads = 
view(qkv_heads, 2); + auto &k_heads = k_attn->forward(input); + auto &v_heads = v_attn->forward(input); + auto &q_heads = q_attn->forward(input); + auto &qs = split(q_heads, n_heads); + auto &ks = split(k_heads, n_heads); + auto &vs = split(v_heads, n_heads); + + // double divisor = 1 / sqrt(double(n_embd) / double(n_heads)); + // double divisor = 1; + + std::vector*> qks_sm_vs; + for(u64 i = 0; i < n_heads; ++i) + { + auto &q = view(qs, i); + auto &k = view(ks, i); + auto &v = view(vs, i); + auto &kt = transpose(k); + // auto &qks = matmul(q, kt); + auto &qks = matmul_triangular(q, kt); + // auto &qk = matmul(q, kt); + // auto &qks = scalarmul(qk, divisor); + + /* + Tensor *x = &input; + if(attention_type % 2 == 0) + { + // printf("global\n"); + auto &qks_masked = attention_mask(qks, 10000.0); + x = &qks_masked; + } + else + { + auto &qks_masked = local_attention_mask(qks, 10000.0); + x = &qks_masked; + } + auto &qks_sm = softmax(*x); + auto &qks_sm_v = matmul(qks_sm, v); + */ + + Tensor *x = &input; + if(attention_type % 2 == 0) + { + auto &qks_sm = softmax_triangular(qks); + x = &qks_sm; + } + else + { + // auto &qks_masked = local_attention_mask(qks, 10000.0); + // auto &qks_sm = softmax_triangular(qks_masked); + + auto &qks_sm = softmax_triangular(qks); + x = &qks_sm; + } + auto &qks_sm_v = matmul(*x, v); + + qks_sm_vs.push_back(&qks_sm_v); + } + + auto &qks_sm_vs_cat = concat(qks_sm_vs); + auto &res = c_proj->forward(qks_sm_vs_cat); + return res; + } +}; + +template +class TransformerBlock : public SytorchModule +{ + using SytorchModule::add; + + MultiHeadAttention *attn; + FFN *ffn; + LayerNorm *ln0; + LayerNorm *ln1; + + u64 n_heads, n_embd; + u64 attention_type; + u64 window_size; +public: + + TransformerBlock(u64 n_heads, u64 n_embd, u64 attention_type, u64 window_size): n_heads(n_heads), n_embd(n_embd) + { + attn = new MultiHeadAttention(n_heads, n_embd, attention_type, window_size); + ffn = new FFN(n_embd, 4*n_embd); + ln0 = new LayerNorm(n_embd); + ln1 
= new LayerNorm(n_embd); + } + + Tensor &_forward(Tensor &input) + { + auto &ln0_out = ln0->forward(input); + auto &attn_out = attn->forward(ln0_out); + auto &attn_out_add = add(attn_out, input); + auto &ln1_out = ln1->forward(attn_out_add); + auto &ffn_out = ffn->forward(ln1_out); + auto &ffn_out_add = add(ffn_out, attn_out_add); + return ffn_out_add; + } +}; + +template +class GPT2 : public SytorchModule +{ + std::vector *> blocks; + // LayerNorm *ln_f; + u64 n_layer, n_heads, n_embd; + u64 window_size; + +public: + + GPT2(u64 n_layer, u64 n_heads, u64 n_embd, u64 window_size): n_layer(n_layer), n_heads(n_heads), n_embd(n_embd) + { + for(u64 i = 0; i < n_layer; ++i) + { + blocks.push_back(new TransformerBlock(n_heads, n_embd, i, window_size)); + } + // ln_f = new LayerNorm(n_embd); + } + + Tensor &_forward(Tensor &input) + { + Tensor *x = &input; + + for(u64 i = 0; i < n_layer - 1; ++i) + { + auto &block = blocks[i]; + auto &x_out = block->forward(*x); + x = &x_out; + } + + auto &block = blocks[n_layer - 1]; + return block->forward(*x); + + // for(u64 i = 0; i < n_layer; ++i) + // { + // auto &block = blocks[i]; + // auto &x_out = block->forward(*x); + // x = &x_out; + // } + // return ln_f->forward(*x); + } +}; + + +int lt_main(int __argc, char**__argv){ + + sytorch_init(); + + + const u64 n_embd = 2048; + const u64 n_head = 16; + const u64 n_layer = 24; + const u64 window_size = 256; + + int party = atoi(__argv[1]); + std::string ip = "127.0.0.1"; + if (__argc > 2) + ip = __argv[2]; + + using LlamaVersion = LlamaTransformer; + LlamaVersion *llama = new LlamaVersion(); + srand(time(NULL)); + + const u64 scale = 12; + + LlamaConfig::bitlength = 52; + LlamaConfig::party = party; + LlamaConfig::stochasticT = false; + LlamaConfig::stochasticRT = false; + LlamaConfig::num_threads = 4; + + llama->init(ip, true); + + GPT2 net(n_layer, n_head, n_embd, window_size); + net.init(scale); + net.setBackend(llama); + net.optimize(); + if(party == SERVER){ + // 
net.load("gpt-neo-1pt3B-weights.dat"); + net.zero(); + } + else if(party == DEALER){ + net.zero(); + } + llama->initializeInferencePartyA(net.root); + + u64 n_seq = 128; + Tensor input({n_seq, n_embd}); + if(party == CLIENT){ + input.fill(1LL << (scale-2)); + } + llama->initializeInferencePartyB(input); + + llama::start(); + net.forward(input); + llama::end(); + + auto &output = net.activation; + llama->outputA(output); + llama->finalize(); + + return 0; +} + +int main(int __argc, char**__argv) +{ + lt_main(__argc,__argv); +} diff --git a/GPU-MPC/ext/sytorch/benchmarks/llama-13b.cpp b/GPU-MPC/ext/sytorch/benchmarks/llama-13b.cpp new file mode 100644 index 00000000..e208cad5 --- /dev/null +++ b/GPU-MPC/ext/sytorch/benchmarks/llama-13b.cpp @@ -0,0 +1,266 @@ +// Authors: Kanav Gupta, Neha Jawalkar +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#include +#include +#include +#include +#include +#include + +template +class FFN : public SytorchModule +{ + using SytorchModule::silu; + using SytorchModule::mul; + + u64 in; + u64 intermediate_size; + +public: + FC *up1; + FC *up2; + FC *down; + + FFN(u64 in, u64 intermediate_size) : in(in), intermediate_size(intermediate_size) + { + up1 = new FC(in, intermediate_size, false); + up2 = new FC(in, intermediate_size, false); + down = new FC(intermediate_size, in, false); + } + + Tensor &_forward(Tensor &input) + { + auto &a = up1->forward(input); + auto &b = up2->forward(input); + return down->forward(mul(silu(a), b)); + } +}; + +template +class MultiHeadAttention : public SytorchModule +{ + using SytorchModule::split; + using SytorchModule::view; + using SytorchModule::transpose; + using SytorchModule::matmul; + using SytorchModule::matmul_triangular; + using SytorchModule::scalarmul; + using SytorchModule::softmax_triangular; + using SytorchModule::concat; + + using SytorchModule::mul; + using SytorchModule::add; + using SytorchModule::silu; + using SytorchModule::rotary_embedding; + +public: + FC *q_attn; + FC *k_attn; + FC *v_attn; + FC *c_proj; + + u64 n_heads; + u64 n_embd; + + MultiHeadAttention(u64 n_heads, u64 n_embd) : n_heads(n_heads), n_embd(n_embd) + { + always_assert(n_embd % n_heads == 0); + q_attn = new FC(n_embd, n_embd, false); + k_attn = new FC(n_embd, n_embd, false); + v_attn = new FC(n_embd, n_embd, false); + c_proj = new FC(n_embd, n_embd, false); + } + + Tensor &_forward(Tensor &input) + { + auto &q_heads = q_attn->forward(input); + auto &k_heads = k_attn->forward(input); + auto &v_heads = v_attn->forward(input); + auto &qs = split(q_heads, n_heads); + auto &ks = split(k_heads, n_heads); + auto &vs = split(v_heads, n_heads); + + double divisor = 1 / sqrt(double(n_embd) / double(n_heads)); + + std::vector *> qks_sm_vs; + for (u64 i = 0; i < n_heads; ++i) + { + auto &q = view(qs, i); + auto &k = view(ks, i); + auto &v = view(vs, i); + + auto 
&q1 = rotary_embedding(q); + auto &k1 = rotary_embedding(k); + + auto &kt = transpose(k1); + auto &qk = matmul_triangular(q1, kt); + auto &qks = scalarmul(qk, divisor); + + auto &qks_sm = softmax_triangular(qks); + + auto &qks_sm_v = matmul(qks_sm, v); + qks_sm_vs.push_back(&qks_sm_v); + } + + auto &qks_sm_vs_cat = concat(qks_sm_vs); + auto &res = c_proj->forward(qks_sm_vs_cat); + return res; + } +}; + +template +class TransformerBlock : public SytorchModule +{ + using SytorchModule::add; + + MultiHeadAttention *attn; + FFN *ffn; + RMSNorm *ln0; + RMSNorm *ln1; + + u64 n_heads, n_embd, intermediate_size; + +public: + TransformerBlock(u64 n_heads, u64 n_embd, u64 intermediate_size) : n_heads(n_heads), n_embd(n_embd), intermediate_size(intermediate_size) + { + attn = new MultiHeadAttention(n_heads, n_embd); + ffn = new FFN(n_embd, intermediate_size); + ln0 = new RMSNorm(n_embd, false); + ln1 = new RMSNorm(n_embd, false); + } + + Tensor &_forward(Tensor &input) + { + auto &ln0_out = ln0->forward(input); + auto &attn_out = attn->forward(ln0_out); + auto &attn_out_add = add(attn_out, input); + auto &ln1_out = ln1->forward(attn_out_add); + auto &ffn_out = ffn->forward(ln1_out); + auto &ffn_out_add = add(ffn_out, attn_out_add); + return ffn_out_add; + } +}; + +template +class LLAMA_MODEL : public SytorchModule +{ + std::vector *> blocks; + RMSNorm *ln_f; + u64 n_layer, n_heads, n_embd, intermediate_size; + +public: + LLAMA_MODEL(u64 n_layer, u64 n_heads, u64 n_embd, u64 intermediate_size) : n_layer(n_layer), n_heads(n_heads), n_embd(n_embd), intermediate_size(intermediate_size) + { + for (u64 i = 0; i < n_layer; ++i) + { + blocks.push_back(new TransformerBlock(n_heads, n_embd, intermediate_size)); + } + ln_f = new RMSNorm(n_embd); + } + + Tensor &_forward(Tensor &input) + { + Tensor *x = &input; + for (u64 i = 0; i < n_layer; ++i) + { + auto &block = blocks[i]; + auto &x_out = block->forward(*x); + x = &x_out; + } + return *x; + // return ln_f->forward(*x); + } +}; + 
+template +class LlamaNextWordLogits : public SytorchModule +{ + using SytorchModule::view; + using SytorchModule::transpose; + LLAMA_MODEL *llama_model; + FC *fc; + u64 n_layer, n_heads, n_embd, n_vocab, intermediate_size; + +public: + LlamaNextWordLogits(u64 n_layer, u64 n_heads, u64 n_embd, u64 n_vocab, u64 intermediate_size) : n_layer(n_layer), n_heads(n_heads), n_embd(n_embd), n_vocab(n_vocab), intermediate_size(intermediate_size) + { + llama_model = new LLAMA_MODEL(n_layer, n_heads, n_embd, intermediate_size); + fc = new FC(n_embd, n_vocab, false); + } + + Tensor &_forward(Tensor &input) + { + auto &fc_in = llama_model->forward(input); + return fc_in; + // auto &fc_out = fc->forward(fc_in); + // return view(fc_out, -1); + } +}; + +void lt_main(int party, std::string ip) +{ + sytorch_init(); + + const u64 n_vocab = 32000; + const u64 n_embd = 5120; + const u64 n_head = 40; // 40; + const u64 n_layer = 40; // 40; + const u64 intermediate_size = 13824; + const u64 scale = 12; + + using LlamaVersion = LlamaTransformer; + LlamaVersion *llama = new LlamaVersion(); + LlamaConfig::bitlength = 48; + LlamaConfig::party = party; + llama->init(ip, true); + + LlamaNextWordLogits llama_model(n_layer, n_head, n_embd, n_vocab, intermediate_size); + u64 n_seq = 128; // get_n_seq(fname, n_embd); + Tensor input({n_seq, n_embd}); + input.zero(); + llama_model.init(scale, input); + llama_model.setBackend(llama); + llama_model.optimize(); + llama_model.zero(); + + // std::string fname = std::string("lambada-meta-llama2-7b/") + /*std::to_string(i)*/ +"999.dat"; + llama->initializeInferencePartyA(llama_model.root); + llama->initializeInferencePartyB(input); + + llama::start(); + auto &res = llama_model.forward(input); + llama::end(); + + auto &output = llama_model.activation; + llama->outputA(output); + llama->finalize(); +} + +int main(int __argc, char **__argv) +{ + int party = atoi(__argv[1]); + std::string ip = "0.0.0.0"; + if (__argc > 2) + ip = __argv[2]; + lt_main(party, ip); 
+ return 0; +} \ No newline at end of file diff --git a/GPU-MPC/ext/sytorch/benchmarks/llama-7b.cpp b/GPU-MPC/ext/sytorch/benchmarks/llama-7b.cpp new file mode 100644 index 00000000..595dd5b6 --- /dev/null +++ b/GPU-MPC/ext/sytorch/benchmarks/llama-7b.cpp @@ -0,0 +1,266 @@ +// Authors: Kanav Gupta, Neha Jawalkar +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#include +#include +#include +#include +#include +#include + +template +class FFN : public SytorchModule +{ + using SytorchModule::silu; + using SytorchModule::mul; + + u64 in; + u64 intermediate_size; + +public: + FC *up1; + FC *up2; + FC *down; + + FFN(u64 in, u64 intermediate_size) : in(in), intermediate_size(intermediate_size) + { + up1 = new FC(in, intermediate_size, false); + up2 = new FC(in, intermediate_size, false); + down = new FC(intermediate_size, in, false); + } + + Tensor &_forward(Tensor &input) + { + auto &a = up1->forward(input); + auto &b = up2->forward(input); + return down->forward(mul(silu(a), b)); + } +}; + +template +class MultiHeadAttention : public SytorchModule +{ + using SytorchModule::split; + using SytorchModule::view; + using SytorchModule::transpose; + using SytorchModule::matmul; + using SytorchModule::matmul_triangular; + using SytorchModule::scalarmul; + using SytorchModule::softmax_triangular; + using SytorchModule::concat; + + using SytorchModule::mul; + using SytorchModule::add; + using SytorchModule::silu; + using SytorchModule::rotary_embedding; + +public: + FC *q_attn; + FC *k_attn; + FC *v_attn; + FC *c_proj; + + u64 n_heads; + u64 n_embd; + + MultiHeadAttention(u64 n_heads, u64 n_embd) : n_heads(n_heads), n_embd(n_embd) + { + always_assert(n_embd % n_heads == 0); + q_attn = new FC(n_embd, n_embd, false); + k_attn = new FC(n_embd, n_embd, false); + v_attn = new FC(n_embd, n_embd, false); + c_proj = new FC(n_embd, n_embd, false); + } + + Tensor &_forward(Tensor &input) + { + auto &q_heads = q_attn->forward(input); + auto &k_heads = k_attn->forward(input); + auto &v_heads = v_attn->forward(input); + auto &qs = split(q_heads, n_heads); + auto &ks = split(k_heads, n_heads); + auto &vs = split(v_heads, n_heads); + + double divisor = 1 / sqrt(double(n_embd) / double(n_heads)); + + std::vector *> qks_sm_vs; + for (u64 i = 0; i < n_heads; ++i) + { + auto &q = view(qs, i); + auto &k = view(ks, i); + auto &v = view(vs, i); + + auto 
&q1 = rotary_embedding(q); + auto &k1 = rotary_embedding(k); + + auto &kt = transpose(k1); + auto &qk = matmul_triangular(q1, kt); + auto &qks = scalarmul(qk, divisor); + + auto &qks_sm = softmax_triangular(qks); + + auto &qks_sm_v = matmul(qks_sm, v); + qks_sm_vs.push_back(&qks_sm_v); + } + + auto &qks_sm_vs_cat = concat(qks_sm_vs); + auto &res = c_proj->forward(qks_sm_vs_cat); + return res; + } +}; + +template +class TransformerBlock : public SytorchModule +{ + using SytorchModule::add; + + MultiHeadAttention *attn; + FFN *ffn; + RMSNorm *ln0; + RMSNorm *ln1; + + u64 n_heads, n_embd, intermediate_size; + +public: + TransformerBlock(u64 n_heads, u64 n_embd, u64 intermediate_size) : n_heads(n_heads), n_embd(n_embd), intermediate_size(intermediate_size) + { + attn = new MultiHeadAttention(n_heads, n_embd); + ffn = new FFN(n_embd, intermediate_size); + ln0 = new RMSNorm(n_embd, false); + ln1 = new RMSNorm(n_embd, false); + } + + Tensor &_forward(Tensor &input) + { + auto &ln0_out = ln0->forward(input); + auto &attn_out = attn->forward(ln0_out); + auto &attn_out_add = add(attn_out, input); + auto &ln1_out = ln1->forward(attn_out_add); + auto &ffn_out = ffn->forward(ln1_out); + auto &ffn_out_add = add(ffn_out, attn_out_add); + return ffn_out_add; + } +}; + +template +class LLAMA_MODEL : public SytorchModule +{ + std::vector *> blocks; + RMSNorm *ln_f; + u64 n_layer, n_heads, n_embd, intermediate_size; + +public: + LLAMA_MODEL(u64 n_layer, u64 n_heads, u64 n_embd, u64 intermediate_size) : n_layer(n_layer), n_heads(n_heads), n_embd(n_embd), intermediate_size(intermediate_size) + { + for (u64 i = 0; i < n_layer; ++i) + { + blocks.push_back(new TransformerBlock(n_heads, n_embd, intermediate_size)); + } + ln_f = new RMSNorm(n_embd); + } + + Tensor &_forward(Tensor &input) + { + Tensor *x = &input; + for (u64 i = 0; i < n_layer; ++i) + { + auto &block = blocks[i]; + auto &x_out = block->forward(*x); + x = &x_out; + } + return *x; + // return ln_f->forward(*x); + } +}; + 
+template +class LlamaNextWordLogits : public SytorchModule +{ + using SytorchModule::view; + using SytorchModule::transpose; + LLAMA_MODEL *llama_model; + FC *fc; + u64 n_layer, n_heads, n_embd, n_vocab, intermediate_size; + +public: + LlamaNextWordLogits(u64 n_layer, u64 n_heads, u64 n_embd, u64 n_vocab, u64 intermediate_size) : n_layer(n_layer), n_heads(n_heads), n_embd(n_embd), n_vocab(n_vocab), intermediate_size(intermediate_size) + { + llama_model = new LLAMA_MODEL(n_layer, n_heads, n_embd, intermediate_size); + fc = new FC(n_embd, n_vocab, false); + } + + Tensor &_forward(Tensor &input) + { + auto &fc_in = llama_model->forward(input); + return fc_in; + // auto &fc_out = fc->forward(fc_in); + // return view(fc_out, -1); + } +}; + +void lt_main(int party, std::string ip) +{ + sytorch_init(); + + const u64 n_vocab = 32000; + const u64 n_embd = 4096; + const u64 n_head = 32; + const u64 n_layer = 32;//32; + const u64 intermediate_size = 11008; + const u64 scale = 12; + + using LlamaVersion = LlamaTransformer; + LlamaVersion *llama = new LlamaVersion(); + LlamaConfig::bitlength = 48; + LlamaConfig::party = party; + llama->init(ip, true); + + LlamaNextWordLogits llama_model(n_layer, n_head, n_embd, n_vocab, intermediate_size); + u64 n_seq = 128; // get_n_seq(fname, n_embd); + Tensor input({n_seq, n_embd}); + input.zero(); + llama_model.init(scale, input); + llama_model.setBackend(llama); + llama_model.optimize(); + llama_model.zero(); + + // std::string fname = std::string("lambada-meta-llama2-7b/") + /*std::to_string(i)*/ +"999.dat"; + llama->initializeInferencePartyA(llama_model.root); + llama->initializeInferencePartyB(input); + + llama::start(); + auto &res = llama_model.forward(input); + llama::end(); + + auto &output = llama_model.activation; + llama->outputA(output); + llama->finalize(); +} + +int main(int __argc, char **__argv) +{ + int party = atoi(__argv[1]); + std::string ip = "0.0.0.0"; + if (__argc > 2) + ip = __argv[2]; + lt_main(party, ip); + return 
0; +} \ No newline at end of file diff --git a/GPU-MPC/ext/sytorch/scripts/all-cpu-benchmarks-local.py b/GPU-MPC/ext/sytorch/scripts/all-cpu-benchmarks-local.py new file mode 100644 index 00000000..49591859 --- /dev/null +++ b/GPU-MPC/ext/sytorch/scripts/all-cpu-benchmarks-local.py @@ -0,0 +1,85 @@ +import subprocess +import csv + +mute = False + +benchmarks = [ + 'bert-tiny', + 'bert-base', + 'bert-large', # very large key + 'gpt2', + 'gptneo', # very large key + "llama-7b", # very large key + "llama-13b", # very large key +] + +logfile1 = open("log1.log", 'a') +logfile2 = open("log2.log", 'a') +outcsv = open("results.csv", 'a') +outcsv.write("model,act_time,act_comm,softmax_time,softmax_comm,norm_time,norm_comm,total_time,total_comm\n") +outcsv.flush() + +def run_seq(cmd): + p = subprocess.Popen(cmd, shell=True, stdout=logfile1, stderr=logfile1) + p.wait() + + +def run_par(cmd1, cmd2): + p1 = subprocess.Popen(cmd1, shell=True, stdout=logfile1, stderr=logfile1) + p2 = subprocess.Popen(cmd2, shell=True, stdout=logfile2, stderr=logfile2) + p1.wait() + p2.wait() + +for b in benchmarks: + print("[+] benchmarking " + b) + print("[+] --- compiling...") + run_seq('make benchmark-' + b) + print("[+] --- running dealer...") + run_seq(f'OMP_NUM_THREADS=4 ./benchmark-{b} 1') + print("[+] --- running online phase...") + run_par(f'OMP_NUM_THREADS=4 ./benchmark-{b} 2', f'OMP_NUM_THREADS=4 ./benchmark-{b} 3') + + total_time = 0.0 + total_comm = 0.0 + act_time = 0.0 + act_comm = 0.0 + softmax_time = 0.0 + softmax_comm = 0.0 + norm_time = 0.0 + norm_comm = 0.0 + with open('llama3.csv') as f: + csvFile = csv.reader(f) + header_skipped = False + for lines in csvFile: + if not header_skipped: + header_skipped = True + continue + if lines[0].startswith('GeLU::'): + act_time += float(lines[1]) + act_comm += float(lines[2]) + elif lines[0].startswith('LayerNorm::'): + norm_time += float(lines[1]) + norm_comm += float(lines[2]) + elif lines[0].startswith('nExp::'): + softmax_time += 
float(lines[1]) + softmax_comm += float(lines[2]) + elif lines[0].startswith('Softmax::'): + softmax_time += float(lines[1]) + softmax_comm += float(lines[2]) + total_time += float(lines[1]) + total_comm += float(lines[2]) + run_seq(f"cp llama3.csv {b}.csv") + print("[+] --- act time = " + str(act_time/1000.0) + " s") + print("[+] --- act comm = " + str(act_comm/1024.0) + " GB") + print("[+] --- softmax time = " + str(softmax_time/1000.0) + " s") + print("[+] --- softmax comm = " + str(softmax_comm/1024.0) + " GB") + print("[+] --- norm time = " + str(norm_time/1000.0) + " s") + print("[+] --- norm comm = " + str(norm_comm/1024.0) + " GB") + print("[+] --- online time = " + str(total_time/1000.0) + " s") + print("[+] --- online comm = " + str(total_comm/1024.0) + " GB") + outcsv.write(f"{b},{act_time},{act_comm},{softmax_time},{softmax_comm},{norm_time},{norm_comm},{total_time},{total_comm}\n") + outcsv.flush() + +logfile1.close() +logfile2.close() +outcsv.close() diff --git a/GPU-MPC/ext/sytorch/scripts/all-cpu-benchmarks-remote.py b/GPU-MPC/ext/sytorch/scripts/all-cpu-benchmarks-remote.py new file mode 100644 index 00000000..57f12366 --- /dev/null +++ b/GPU-MPC/ext/sytorch/scripts/all-cpu-benchmarks-remote.py @@ -0,0 +1,88 @@ +import subprocess +import csv +import sys + +mute = False + +if len(sys.argv) < 3: + print("missing arguments") + print(f"usage: python {sys.argv[0]} ") + exit() + +ip = sys.argv[1] +party = int(sys.argv[2]) + +benchmarks = [ + 'bert-tiny', + 'bert-base', + 'bert-large', # very large key + 'gpt2', + 'gptneo', # very large key + "llama-7b", # very large key + "llama-13b", # very large key +] + +logfile1 = open("log1.log", 'a') +outcsv = open("results.csv", 'a') +outcsv.write("model,act_time,act_comm,softmax_time,softmax_comm,norm_time,norm_comm,total_time,total_comm\n") +outcsv.flush() + +def run_seq(cmd): + p = subprocess.Popen(cmd, shell=True, stdout=logfile1, stderr=logfile1) + p.wait() + + +for b in benchmarks: + print("[+] benchmarking 
" + b) + print("[+] --- compiling...") + run_seq('make benchmark-' + b) + print("[+] --- running dealer...") + run_seq(f'OMP_NUM_THREADS=4 ./benchmark-{b} 1') + print("[+] --- running online phase...") + # run_par(f'OMP_NUM_THREADS=4 ./benchmark-{b} 2', f'OMP_NUM_THREADS=4 ./benchmark-{b} 3') + run_seq(f"OMP_NUM_THREADS=4 ./benchmark-{b} {party+2} {ip}") + + total_time = 0.0 + total_comm = 0.0 + act_time = 0.0 + act_comm = 0.0 + softmax_time = 0.0 + softmax_comm = 0.0 + norm_time = 0.0 + norm_comm = 0.0 + with open(f'llama{party+2}.csv') as f: + csvFile = csv.reader(f) + header_skipped = False + for lines in csvFile: + if not header_skipped: + header_skipped = True + continue + if lines[0].startswith('GeLU::'): + act_time += float(lines[1]) + act_comm += float(lines[2]) + elif lines[0].startswith('LayerNorm::'): + norm_time += float(lines[1]) + norm_comm += float(lines[2]) + elif lines[0].startswith('nExp::'): + softmax_time += float(lines[1]) + softmax_comm += float(lines[2]) + elif lines[0].startswith('Softmax::'): + softmax_time += float(lines[1]) + softmax_comm += float(lines[2]) + total_time += float(lines[1]) + total_comm += float(lines[2]) + run_seq(f"cp llama{party+2}.csv remote-{b}.csv") + print("[+] --- act time = " + str(act_time/1000.0) + " s") + print("[+] --- act comm = " + str(act_comm/1024.0) + " GB") + print("[+] --- softmax time = " + str(softmax_time/1000.0) + " s") + print("[+] --- softmax comm = " + str(softmax_comm/1024.0) + " GB") + print("[+] --- norm time = " + str(norm_time/1000.0) + " s") + print("[+] --- norm comm = " + str(norm_comm/1024.0) + " GB") + print("[+] --- online time = " + str(total_time/1000.0) + " s") + print("[+] --- online comm = " + str(total_comm/1024.0) + " GB") + outcsv.write(f"remote-{b},{act_time},{act_comm},{softmax_time},{softmax_comm},{norm_time},{norm_comm},{total_time},{total_comm}\n") + outcsv.flush() + +logfile1.close() +logfile2.close() +outcsv.close()