From 463fa6193ed9425df7e85feaed68c3887d06b2c6 Mon Sep 17 00:00:00 2001
From: SamuraiBUPT <31409163@bupt.edu.cn>
Date: Thu, 6 Jul 2023 19:12:23 +0800
Subject: [PATCH 1/2] int8_mode fix and remove decoupled mode in config

---
 all_models/llama/fastertransformer/config.pbtxt | 4 ----
 docs/llama_guide.md                             | 7 ++++++-
 src/libfastertransformer.cc                     | 7 ++++---
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/all_models/llama/fastertransformer/config.pbtxt b/all_models/llama/fastertransformer/config.pbtxt
index 1f80e22..ad7be1f 100644
--- a/all_models/llama/fastertransformer/config.pbtxt
+++ b/all_models/llama/fastertransformer/config.pbtxt
@@ -29,10 +29,6 @@ backend: "fastertransformer"
 default_model_filename: "llama"
 max_batch_size: 1024
 
-model_transaction_policy {
-  decoupled: True
-}
-
 input [
   {
     name: "input_ids"
diff --git a/docs/llama_guide.md b/docs/llama_guide.md
index d1c60fc..c6bf9dc 100644
--- a/docs/llama_guide.md
+++ b/docs/llama_guide.md
@@ -194,4 +194,9 @@ I0628 02:59:06.177982 11650 http_server.cc:3477] Started HTTPService at 0.0.0.0:
 I0628 02:59:06.219577 11650 http_server.cc:184] Started Metrics Service at 0.0.0.0:8002
 ```
 
-That means the program was launched successfully.
\ No newline at end of file
+That means the program was launched successfully.
+
+# Update
++ offer `int8_mode` support in `libfastertransformer.cc` so that the compiler can find a matching constructor.
++ remove `decoupled: True` from [config.pbtxt](../all_models/llama/fastertransformer/config.pbtxt#L31):
+  decoupled mode appears to be broken and has not been fixed so far. If you have a solution, contributions are welcome.
\ No newline at end of file
diff --git a/src/libfastertransformer.cc b/src/libfastertransformer.cc
index 174188d..1e2a6f1 100644
--- a/src/libfastertransformer.cc
+++ b/src/libfastertransformer.cc
@@ -333,13 +333,14 @@ std::shared_ptr<AbstractTransformerModel> ModelState::ModelFactory(
         }
     } else if (model_type == "Llama") {
+        const int int8_mode = param_get_int(param, "int8_mode");
         if (data_type == "fp16") {
-            ft_model = std::make_shared<LlamaTritonModel<half>>(tp, pp, custom_ar, model_dir);
+            ft_model = std::make_shared<LlamaTritonModel<half>>(tp, pp, custom_ar, model_dir, int8_mode);
 #ifdef ENABLE_BF16
         } else if (data_type == "bf16") {
-            ft_model = std::make_shared<LlamaTritonModel<__nv_bfloat16>>(tp, pp, custom_ar, model_dir);
+            ft_model = std::make_shared<LlamaTritonModel<__nv_bfloat16>>(tp, pp, custom_ar, model_dir, int8_mode);
 #endif
         } else if (data_type == "fp32") {
-            ft_model = std::make_shared<LlamaTritonModel<float>>(tp, pp, custom_ar, model_dir);
+            ft_model = std::make_shared<LlamaTritonModel<float>>(tp, pp, custom_ar, model_dir, int8_mode);
         } else {
             LOG_MESSAGE(TRITONSERVER_LOG_ERROR, dt_message.c_str());
         }
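Note on patch 1: `param_get_int(param, "int8_mode")` reads from the `parameters` section of the model's `config.pbtxt`, so the config needs a matching entry. A minimal sketch of such an entry follows — the key name comes from the patch itself, but the `string_value` style and the default of `"0"` (int8 disabled) are assumptions to verify against your own config and converted weights:

```pbtxt
parameters {
  key: "int8_mode"
  value: {
    string_value: "0"  # Triton stores parameter values as strings; the backend parses the int
  }
}
```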
From 4d3292ca744265a80e88c691cf52facdce25b50c Mon Sep 17 00:00:00 2001
From: SamuraiBUPT <31409163@bupt.edu.cn>
Date: Mon, 10 Jul 2023 19:49:26 +0800
Subject: [PATCH 2/2] decoupled mode support

---
 all_models/llama/fastertransformer/config.pbtxt |  4 ++++
 docs/llama_guide.md                             | 12 +++++++-----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/all_models/llama/fastertransformer/config.pbtxt b/all_models/llama/fastertransformer/config.pbtxt
index ad7be1f..1f80e22 100644
--- a/all_models/llama/fastertransformer/config.pbtxt
+++ b/all_models/llama/fastertransformer/config.pbtxt
@@ -29,6 +29,10 @@ backend: "fastertransformer"
 default_model_filename: "llama"
 max_batch_size: 1024
 
+model_transaction_policy {
+  decoupled: True
+}
+
 input [
   {
     name: "input_ids"
diff --git a/docs/llama_guide.md b/docs/llama_guide.md
index c6bf9dc..bc7dcfc 100644
--- a/docs/llama_guide.md
+++ b/docs/llama_guide.md
@@ -5,7 +5,8 @@ We have deployed LLaMa on triton inference server with faster transformer backen
 + Ubuntu 20.04
 + docker: 24.0.2
 + cmake
-+ python
++ python: 3.10.6
++ pip: 23.1.2
 
 Hardware:
 + RTX 3090 (24G VMEM) * 2
@@ -26,12 +27,14 @@ We will expand our work in `llama_deploy` directory.
 
 ## 1. build docker image
 To reproduce all further steps, it is easier to run everything inside a Docker container, so first build a Triton docker image.
+We choose the `23.04` image tag because it is reported to support decoupled mode. See this [issue](https://github.com/triton-inference-server/server/issues/6002#issuecomment-1617106369) for more info.
+
 ```bash
 git clone https://github.com/void-main/fastertransformer_backend.git
 cd fastertransformer_backend
 
-sudo docker build --rm --build-arg TRITON_VERSION=22.12 -t triton_ft_backend:22.12 -f docker/Dockerfile .
+sudo docker build --rm --build-arg TRITON_VERSION=23.04 -t triton_ft_backend:23.04 -f docker/Dockerfile .
 ```
 
 The build process may take more than five minutes, depending on your hardware.
 
@@ -41,7 +44,7 @@ When finished, launch the container:
 ```bash
 cd ../
 
-sudo docker run -it --rm --gpus=all --net=host --shm-size=4G -v $(pwd):/ft_workspace -p8888:8888 -p8000:8000 -p8001:8001 -p8002:8002 triton_ft_backend:22.12 bash
+sudo docker run -it --rm --gpus=all --net=host --shm-size=4G -v $(pwd):/ft_workspace -p8888:8888 -p8000:8000 -p8001:8001 -p8002:8002 triton_ft_backend:23.04 bash
 ```
 
 We have mapped the `llama_deploy` directory to `/ft_workspace` inside the container.
@@ -198,5 +201,4 @@ That means the program was launched successfully.
 
 # Update
 + offer `int8_mode` support in `libfastertransformer.cc` so that the compiler can find a matching constructor.
-+ remove `decoupled: True` from [config.pbtxt](../all_models/llama/fastertransformer/config.pbtxt#L31):
-  decoupled mode appears to be broken and has not been fixed so far. If you have a solution, contributions are welcome.
\ No newline at end of file
++ fix `decoupled mode` support: decoupled mode now works when the image is built on a newer tritonserver base (23.04 tested).
\ No newline at end of file
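Note on patch 2: with `model_transaction_policy { decoupled: True }`, a request may produce zero, one, or many responses, which only the gRPC streaming API can consume. Below is a minimal client sketch using `tritonclient[grpc]`. The model name `fastertransformer` and the tensor names and dtypes (`input_ids`, `input_lengths`, `request_output_len`, `output_ids`, all `UINT32`) are assumptions based on common FasterTransformer-backend configs — adjust them to match your `config.pbtxt`.

```python
# Minimal decoupled-mode client sketch (assumes `pip install tritonclient[grpc]`).
# Tensor names, dtypes, and the model name are assumptions; check config.pbtxt.
import queue

import numpy as np
import tritonclient.grpc as grpcclient

results = queue.Queue()

def callback(result, error):
    # In decoupled mode each generation step can arrive as its own response.
    results.put(error if error else result)

client = grpcclient.InferenceServerClient(url="localhost:8001")

input_ids = np.array([[1, 15043, 3186]], dtype=np.uint32)  # hypothetical token ids
input_lengths = np.array([[input_ids.shape[1]]], dtype=np.uint32)
output_len = np.array([[32]], dtype=np.uint32)

inputs = [
    grpcclient.InferInput("input_ids", input_ids.shape, "UINT32"),
    grpcclient.InferInput("input_lengths", input_lengths.shape, "UINT32"),
    grpcclient.InferInput("request_output_len", output_len.shape, "UINT32"),
]
inputs[0].set_data_from_numpy(input_ids)
inputs[1].set_data_from_numpy(input_lengths)
inputs[2].set_data_from_numpy(output_len)

# Decoupled models must be called over the streaming API.
client.start_stream(callback=callback)
client.async_stream_infer(model_name="fastertransformer", inputs=inputs)
client.stop_stream()  # closes the stream and waits for outstanding responses

while not results.empty():
    item = results.get()
    if isinstance(item, Exception):
        raise item
    print(item.as_numpy("output_ids"))
```

Because `stop_stream()` waits for in-flight responses, the queue holds every partial result by the time the drain loop runs.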