diff --git a/README.md b/README.md
index b830ade..a637289 100644
--- a/README.md
+++ b/README.md
@@ -30,9 +30,7 @@
 The Triton backend for the [FasterTransformer](https://github.com/NVIDIA/FasterTransformer). This repository provides a script and recipe to run the highly optimized transformer-based encoder and decoder component, and it is tested and maintained by NVIDIA. In the FasterTransformer v4.0, it supports multi-gpu inference on GPT-3 model. This backend integrates FasterTransformer into Triton to use giant GPT-3 model serving by Triton. In the below example, we will show how to use the FasterTransformer backend in Triton to run inference on a GPT-3 model with 345M parameters trained by [Megatron-LM](https://github.com/NVIDIA/Megatron-LM).
 
-Note that this is a research and prototyping tool, not a formal product or maintained framework. User can learn more about Triton backends in the [backend repo](https://github.com/triton-inference-server/backend). Ask questions or report problems on the issues page in this FasterTransformer_backend repo.
-
-
+Note that this is a research and prototyping tool, not a formal product or maintained framework. User can learn more about Triton backends in the [backend repo](https://github.com/triton-inference-server/backend). Ask questions or report problems on the [issues page](https://github.com/triton-inference-server/fastertransformer_backend/issues) in this FasterTransformer_backend repo.
 
 ## Table Of Contents
 
@@ -50,7 +48,7 @@ We provide a docker file, which bases on Triton image `nvcr.io/nvidia/tritonserv
 ```bash
 mkdir workspace && cd workspace
 git clone https://github.com/triton-inference-server/fastertransformer_backend.git
-nvidia-docker build --tag ft_backend --file transformer_backend/Dockerfile .
+nvidia-docker build --tag ft_backend --file fastertransformer_backend/Dockerfile .
 nvidia-docker run --gpus=all -it --rm --volume $HOME:$HOME --volume $PWD:$PWD -w $PWD --name ft-work ft_backend
 cd workspace
 export WORKSPACE=$(pwd)
@@ -71,8 +69,8 @@ pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp
 cd $WORKSPACE
 git clone https://github.com/triton-inference-server/server.git
 export PATH=/usr/local/mpi/bin:$PATH
-source transformer_backend/build.env
-mkdir -p transformer_backend/build && cd $WORKSPACE/transformer_backend/build
+source fastertransformer_backend/build.env
+mkdir -p fastertransformer_backend/build && cd $WORKSPACE/fastertransformer_backend/build
 cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=1 .. && make -j32
 ```
@@ -87,7 +85,7 @@ mkdir -p models/megatron-models/345m
 unzip megatron_lm_345m_v0.0.zip -d models/megatron-models/345m
 python ../sample/pytorch/utils/megatron_ckpt_convert.py -i ./models/megatron-models/345m/release/ -o ./models/megatron-models/c-model/345m/ -t_g 1 -i_g 8
 python _deps/repo-ft-src/sample/pytorch/utils/megatron_ckpt_convert.py -i ./models/megatron-models/345m/release/ -o ./models/megatron-models/c-model/345m/ -t_g 1 -i_g 8
-cp ./models/megatron-models/c-model/345m/8-gpu $WORKSPACE/transformer_backend/all_models/transformer/1/ -r
+cp ./models/megatron-models/c-model/345m/8-gpu $WORKSPACE/fastertransformer_backend/all_models/transformer/1/ -r
 ```
 
 ## Run Serving
@@ -95,12 +93,12 @@ cp ./models/megatron-models/c-model/345m/8-gpu $WORKSPACE/transformer_backend/al
 
 * Run servning directly
 
 ```bash
-cp $WORKSPACE/transformer_backend/build/libtriton_transformer.so $WORKSPACE/transformer_backend/build/lib/libtransformer-shared.so /opt/tritonserver/backends/transformer
+cp $WORKSPACE/fastertransformer_backend/build/libtriton_transformer.so $WORKSPACE/fastertransformer_backend/build/lib/libtransformer-shared.so /opt/tritonserver/backends/transformer
 cd $WORKSPACE && ln -s server/qa/common .
-# Recommend to modify the SERVER_TIMEOUT of common/utils.sh to longer time
-cd $WORKSPACE/transformer_backend/build/
-bash $WORKSPACE/transformer_backend/tools/run_server.sh
-bash $WORKSPACE/transformer_backend/tools/run_client.sh
+# Recommend to modify the SERVER_TIMEOUT of common/util.sh to longer time
+cd $WORKSPACE/fastertransformer_backend/build/
+bash $WORKSPACE/fastertransformer_backend/tools/run_server.sh
+bash $WORKSPACE/fastertransformer_backend/tools/run_client.sh
 python _deps/repo-ft-src/sample/pytorch/utils/convert_gpt_token.py --out_file=triton_out # Used for checking result
 ```
@@ -120,4 +118,4 @@ The model configuration for Triton server is put in `all_models/transformer/conf
 - vocab_size: size of vocabulary
 - decoder_layers: number of transformer layers
 - batch_size: max supported batch size
-- is_fuse_QKV: fusing QKV in one matrix multiplication or not. It also depends on the weights of QKV.
+- is_fuse_QKV: fusing QKV in one matrix multiplication or not. It also depends on the weights of QKV. 
\ No newline at end of file
diff --git a/all_models/transformer/1/.tmp b/all_models/transformer/1/.tmp
new file mode 100644
index 0000000..e69de29
diff --git a/tools/identity_test.py b/tools/identity_test.py
index 9a9d495..31a211d 100644
--- a/tools/identity_test.py
+++ b/tools/identity_test.py
@@ -45,15 +45,15 @@
 start_id = 220
 end_id = 50256
 
-# random_start_ids = np.random.randint(0, 50255, size=(BATCH_SIZE, START_LEN), dtype=np.uint32)
-random_start_ids = np.array([[9915, 27221, 59, 77, 383, 1853, 3327, 1462],
-                             [6601, 4237, 345, 460, 779, 284, 787, 257],
-                             [59, 77, 611, 7, 9248, 796, 657, 8],
-                             [38, 10128, 6032, 651, 8699, 4, 4048, 20753],
-                             [21448, 7006, 930, 12901, 930, 7406, 7006, 198],
-                             [13256, 11, 281, 1605, 3370, 11, 1444, 6771],
-                             [9915, 27221, 59, 77, 383, 1853, 3327, 1462],
-                             [6601, 4237, 345, 460, 779, 284, 787, 257]], np.uint32)
+random_start_ids = np.random.randint(0, 50255, size=(BATCH_SIZE, START_LEN), dtype=np.uint32)
+# random_start_ids = np.array([[9915, 27221, 59, 77, 383, 1853, 3327, 1462],
+#                              [6601, 4237, 345, 460, 779, 284, 787, 257],
+#                              [59, 77, 611, 7, 9248, 796, 657, 8],
+#                              [38, 10128, 6032, 651, 8699, 4, 4048, 20753],
+#                              [21448, 7006, 930, 12901, 930, 7406, 7006, 198],
+#                              [13256, 11, 281, 1605, 3370, 11, 1444, 6771],
+#                              [9915, 27221, 59, 77, 383, 1853, 3327, 1462],
+#                              [6601, 4237, 345, 460, 779, 284, 787, 257]], np.uint32)
 input_len = np.array([ [sentence.size] for sentence in random_start_ids ], np.uint32)
 output_len = np.ones_like(input_len).astype(np.uint32) * OUTPUT_LEN
 
@@ -177,4 +177,4 @@
     print(output_data.shape)
     print(output_data)
     stop_time = datetime.now()
-    print("[INFO] execution time: {} ms".format((stop_time - start_time).total_seconds() * 1000.0 / request_parallelism))
\ No newline at end of file
+    print("[INFO] execution time: {} ms".format((stop_time - start_time).total_seconds() * 1000.0 / request_parallelism))
diff --git a/tools/run_client.sh b/tools/run_client.sh
index eb527b3..9bf0d19 100755
--- a/tools/run_client.sh
+++ b/tools/run_client.sh
@@ -27,7 +27,7 @@
 
 # export CUDA_VISIBLE_DEVICES=0
 
-CLIENT_PY=$WORKSPACE/transformer_backend/tools/identity_test.py
+CLIENT_PY=$WORKSPACE/fastertransformer_backend/tools/identity_test.py
 CLIENT_LOG="./client.log"
 
 rm -rf client.log err.log
@@ -44,4 +44,4 @@ for PROTOCOL in http; do
     set -e
 done
 
-exit $RET
\ No newline at end of file
+exit $RET
diff --git a/tools/run_server.sh b/tools/run_server.sh
index c7ced90..e96c009 100755
--- a/tools/run_server.sh
+++ b/tools/run_server.sh
@@ -28,7 +28,7 @@
 # export CUDA_VISIBLE_DEVICES=0
 
 SERVER=/opt/tritonserver/bin/tritonserver
-SERVER_ARGS="--model-repository=$WORKSPACE/transformer_backend/all_models"
+SERVER_ARGS="--model-repository=$WORKSPACE/fastertransformer_backend/all_models"
 SERVER_LOG="./inference_server.log"
 
 source $WORKSPACE/common/util.sh
@@ -39,4 +39,4 @@ if [ "$SERVER_PID" == "0" ]; then
    echo -e "\n***\n*** Failed to start $SERVER\n***"
    cat $SERVER_LOG
    exit 1
-fi
\ No newline at end of file
+fi
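
For quick reference, the `tools/identity_test.py` change above switches the client from a fixed prompt to randomly drawn token ids; the hard-coded array stays in the file as a comment for reproducible checks. The sketch below reproduces that input construction in isolation. The concrete `BATCH_SIZE`, `START_LEN`, and `OUTPUT_LEN` values here are illustrative assumptions, since the script defines its own.

```python
import numpy as np

# Illustrative values only -- identity_test.py defines BATCH_SIZE, START_LEN and
# OUTPUT_LEN itself; these numbers are assumptions for this sketch.
BATCH_SIZE = 8
START_LEN = 8
OUTPUT_LEN = 24

# Random GPT-2 token ids in [0, 50255), as in the updated script.
random_start_ids = np.random.randint(0, 50255, size=(BATCH_SIZE, START_LEN), dtype=np.uint32)

# One input length per sentence and a constant requested output length,
# mirroring the tensors the client sends to the Triton server.
input_len = np.array([[sentence.size] for sentence in random_start_ids], np.uint32)
output_len = np.ones_like(input_len).astype(np.uint32) * OUTPUT_LEN

print(random_start_ids.shape, input_len.shape, output_len.shape)  # (8, 8) (8, 1) (8, 1)
```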