From 463fa6193ed9425df7e85feaed68c3887d06b2c6 Mon Sep 17 00:00:00 2001
From: SamuraiBUPT <31409163@bupt.edu.cn>
Date: Thu, 6 Jul 2023 19:12:23 +0800
Subject: [PATCH 1/2] int8_mode fix and remove decoupled mode in config

---
 all_models/llama/fastertransformer/config.pbtxt | 4 ----
 docs/llama_guide.md                             | 7 ++++++-
 src/libfastertransformer.cc                     | 7 ++++---
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/all_models/llama/fastertransformer/config.pbtxt b/all_models/llama/fastertransformer/config.pbtxt
index 1f80e22..ad7be1f 100644
--- a/all_models/llama/fastertransformer/config.pbtxt
+++ b/all_models/llama/fastertransformer/config.pbtxt
@@ -29,10 +29,6 @@ backend: "fastertransformer"
 default_model_filename: "llama"
 max_batch_size: 1024
 
-model_transaction_policy {
-  decoupled: True
-}
-
 input [
   {
     name: "input_ids"
diff --git a/docs/llama_guide.md b/docs/llama_guide.md
index d1c60fc..c6bf9dc 100644
--- a/docs/llama_guide.md
+++ b/docs/llama_guide.md
@@ -194,4 +194,9 @@ I0628 02:59:06.177982 11650 http_server.cc:3477] Started HTTPService at 0.0.0.0:
 I0628 02:59:06.219577 11650 http_server.cc:184] Started Metrics Service at 0.0.0.0:8002
 ```
 
-That means the program was launched successfully.
\ No newline at end of file
+That means the program was launched successfully.
+
+# Update
++ offer `int8_mode` support in `libfastertransformer.cc` so that the compiler can find a matching constructor.
++ remove `decoupled: True` from [config.pbtxt](../all_models/llama/fastertransformer/config.pbtxt#L31):
+  decoupled mode appears to be broken and has not been fixed so far. If you have a solution, contributions are welcome.
\ No newline at end of file
diff --git a/src/libfastertransformer.cc b/src/libfastertransformer.cc
index 174188d..1e2a6f1 100644
--- a/src/libfastertransformer.cc
+++ b/src/libfastertransformer.cc
@@ -333,13 +333,14 @@ std::shared_ptr<AbstractTransformerModel> ModelState::ModelFactory(
         }
     } else if (model_type == "Llama") {
+        const int int8_mode = param_get_int(param, "int8_mode");
         if (data_type == "fp16") {
-            ft_model = std::make_shared<LlamaTritonModel<half>>(tp, pp, custom_ar, model_dir);
+            ft_model = std::make_shared<LlamaTritonModel<half>>(tp, pp, custom_ar, model_dir, int8_mode);
 #ifdef ENABLE_BF16
         } else if (data_type == "bf16") {
-            ft_model = std::make_shared<LlamaTritonModel<__nv_bfloat16>>(tp, pp, custom_ar, model_dir);
+            ft_model = std::make_shared<LlamaTritonModel<__nv_bfloat16>>(tp, pp, custom_ar, model_dir, int8_mode);
 #endif
         } else if (data_type == "fp32") {
-            ft_model = std::make_shared<LlamaTritonModel<float>>(tp, pp, custom_ar, model_dir);
+            ft_model = std::make_shared<LlamaTritonModel<float>>(tp, pp, custom_ar, model_dir, int8_mode);
         } else {
             LOG_MESSAGE(TRITONSERVER_LOG_ERROR, dt_message.c_str());
         }
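Note on patch 1: `param_get_int(param, "int8_mode")` reads from the `parameters` section of the model's `config.pbtxt`, so the config needs a matching entry. A minimal sketch of such an entry follows — the key name comes from the patch itself, but the `string_value` style and the default of `"0"` (int8 disabled) are assumptions to verify against your own config and converted weights:

```pbtxt
parameters {
  key: "int8_mode"
  value: {
    string_value: "0"  # Triton stores parameter values as strings; the backend parses the int
  }
}
```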
From 4d3292ca744265a80e88c691cf52facdce25b50c Mon Sep 17 00:00:00 2001
From: SamuraiBUPT <31409163@bupt.edu.cn>
Date: Mon, 10 Jul 2023 19:49:26 +0800
Subject: [PATCH 2/2] decoupled mode support

---
 all_models/llama/fastertransformer/config.pbtxt |  4 ++++
 docs/llama_guide.md                             | 12 +++++++-----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/all_models/llama/fastertransformer/config.pbtxt b/all_models/llama/fastertransformer/config.pbtxt
index ad7be1f..1f80e22 100644
--- a/all_models/llama/fastertransformer/config.pbtxt
+++ b/all_models/llama/fastertransformer/config.pbtxt
@@ -29,6 +29,10 @@ backend: "fastertransformer"
 default_model_filename: "llama"
 max_batch_size: 1024
 
+model_transaction_policy {
+  decoupled: True
+}
+
 input [
   {
     name: "input_ids"
diff --git a/docs/llama_guide.md b/docs/llama_guide.md
index c6bf9dc..bc7dcfc 100644
--- a/docs/llama_guide.md
+++ b/docs/llama_guide.md
@@ -5,7 +5,8 @@ We have deployed LLaMa on triton inference server with faster transformer backen
 + Ubuntu 20.04
 + docker: 24.0.2
 + cmake
-+ python
++ python: 3.10.6
++ pip: 23.1.2
 
 Hardware:
 + RTX 3090 (24G VMEM) * 2
@@ -26,12 +27,14 @@ We will expand our work in `llama_deploy` directory.
 
 ## 1. build docker image
 To reproduce all further steps, it is easier to run everything inside a Docker container, so first build a Triton docker image.
+We choose the `23.04` image tag because it is reported to support decoupled mode. See this [issue](https://github.com/triton-inference-server/server/issues/6002#issuecomment-1617106369) for more info.
+
 ```bash
 git clone https://github.com/void-main/fastertransformer_backend.git
 cd fastertransformer_backend
 
-sudo docker build --rm --build-arg TRITON_VERSION=22.12 -t triton_ft_backend:22.12 -f docker/Dockerfile .
+sudo docker build --rm --build-arg TRITON_VERSION=23.04 -t triton_ft_backend:23.04 -f docker/Dockerfile .
 ```
 
 The build process may take more than five minutes, depending on your hardware.
 
@@ -41,7 +44,7 @@ When finished, launch the container:
 ```bash
 cd ../
 
-sudo docker run -it --rm --gpus=all --net=host --shm-size=4G -v $(pwd):/ft_workspace -p8888:8888 -p8000:8000 -p8001:8001 -p8002:8002 triton_ft_backend:22.12 bash
+sudo docker run -it --rm --gpus=all --net=host --shm-size=4G -v $(pwd):/ft_workspace -p8888:8888 -p8000:8000 -p8001:8001 -p8002:8002 triton_ft_backend:23.04 bash
 ```
 
 We have mapped the `llama_deploy` directory to `/ft_workspace` inside the container.
@@ -198,5 +201,4 @@ That means the program was launched successfully.
 
 # Update
 + offer `int8_mode` support in `libfastertransformer.cc` so that the compiler can find a matching constructor.
-+ remove `decoupled: True` from [config.pbtxt](../all_models/llama/fastertransformer/config.pbtxt#L31):
-  decoupled mode appears to be broken and has not been fixed so far. If you have a solution, contributions are welcome.
\ No newline at end of file
++ fix `decoupled mode` support: decoupled mode now works when the image is built on a newer tritonserver base (23.04 tested).
\ No newline at end of file
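Note on patch 2: with `model_transaction_policy { decoupled: True }`, a request may produce zero, one, or many responses, which only the gRPC streaming API can consume. Below is a minimal client sketch using `tritonclient[grpc]`. The model name `fastertransformer` and the tensor names and dtypes (`input_ids`, `input_lengths`, `request_output_len`, `output_ids`, all `UINT32`) are assumptions based on common FasterTransformer-backend configs — adjust them to match your `config.pbtxt`.

```python
# Minimal decoupled-mode client sketch (assumes `pip install tritonclient[grpc]`).
# Tensor names, dtypes, and the model name are assumptions; check config.pbtxt.
import queue

import numpy as np
import tritonclient.grpc as grpcclient

results = queue.Queue()

def callback(result, error):
    # In decoupled mode each generation step can arrive as its own response.
    results.put(error if error else result)

client = grpcclient.InferenceServerClient(url="localhost:8001")

input_ids = np.array([[1, 15043, 3186]], dtype=np.uint32)  # hypothetical token ids
input_lengths = np.array([[input_ids.shape[1]]], dtype=np.uint32)
output_len = np.array([[32]], dtype=np.uint32)

inputs = [
    grpcclient.InferInput("input_ids", input_ids.shape, "UINT32"),
    grpcclient.InferInput("input_lengths", input_lengths.shape, "UINT32"),
    grpcclient.InferInput("request_output_len", output_len.shape, "UINT32"),
]
inputs[0].set_data_from_numpy(input_ids)
inputs[1].set_data_from_numpy(input_lengths)
inputs[2].set_data_from_numpy(output_len)

# Decoupled models must be called over the streaming API.
client.start_stream(callback=callback)
client.async_stream_infer(model_name="fastertransformer", inputs=inputs)
client.stop_stream()  # closes the stream and waits for outstanding responses

while not results.empty():
    item = results.get()
    if isinstance(item, Exception):
        raise item
    print(item.as_numpy("output_ids"))
```

Because `stop_stream()` waits for in-flight responses, the queue holds every partial result by the time the drain loop runs.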