triton-inference-server · SherronBurtint · Apr 25, 2023 · Apr 25, 2023 · Apr 25, 2023 · Apr 27, 2023
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -110,7 +110,7 @@ if (EXISTS ${FT_DIR})
 else()
   FetchContent_Declare(
     repo-ft
-    GIT_REPOSITORY https://github.com/NVIDIA/FasterTransformer.git 
+    GIT_REPOSITORY https://github.com/void-main/FasterTransformer.git
     GIT_TAG main
     GIT_SHALLOW ON
   )

diff --git a/README.md b/README.md
diff --git a/all_models/llama/ensemble/1/.tmp b/all_models/llama/ensemble/1/.tmp
diff --git a/all_models/llama/ensemble/config.pbtxt b/all_models/llama/ensemble/config.pbtxt
@@ -0,0 +1,298 @@
+name: "ensemble"
+platform: "ensemble"
+max_batch_size: 1024
+
+input [
+  {
+    name: "INPUT_0"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+  },
+  {
+    name: "INPUT_1"
+    data_type: TYPE_UINT32
+    dims: [ -1 ]
+  },
+  {
+   name: "INPUT_2"
+   data_type: TYPE_STRING
+   dims: [ -1 ]
+  },
+  {
+   name: "INPUT_3"
+   data_type: TYPE_STRING
+   dims: [ -1 ]
+  },
+  {
+    name: "runtime_top_k"
+    data_type: TYPE_UINT32
+    dims: [ 1 ]
+    optional: true
+  },
+  {
+    name: "runtime_top_p"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    optional: true
+  },
+  {
+    name: "beam_search_diversity_rate"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    optional: true
+  },
+  {
+    name: "temperature"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    optional: true
+  },
+  {
+    name: "len_penalty"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    optional: true
+  },
+  {
+    name: "repetition_penalty"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    optional: true
+  },
+  {
+    name: "random_seed"
+    data_type: TYPE_UINT64
+    dims: [ 1 ]
+    optional: true
+  },
+  {
+    name: "is_return_log_probs"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    optional: true
+  },
+  {
+    name: "beam_width"
+    data_type: TYPE_UINT32
+    dims: [ 1 ]
+    optional: true
+  },
+  {
+    name: "start_id"
+    data_type: TYPE_UINT32
+    dims: [ 1 ]
+    optional: true
+  },
+  {
+    name: "end_id"
+    data_type: TYPE_UINT32
+    dims: [ 1 ]
+    optional: true
+  },
+  {
+    name: "prompt_learning_task_name_ids"
+    data_type: TYPE_UINT32
+    dims: [ 1 ]
+    optional: true
+  },
+  {
+    name: "top_p_decay"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    optional: true
+  },
+  {
+    name: "top_p_min"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    optional: true
+  },
+  {
+    name: "top_p_reset_ids"
+    data_type: TYPE_UINT32
+    dims: [ 1 ]
+    optional: true
+  }
+]
+output [
+  {
+    name: "OUTPUT_0"
+    data_type: TYPE_STRING
+    dims: [ -1, -1 ]
+  },
+  {
+    name: "sequence_length"
+    data_type: TYPE_UINT32
+    dims: [ -1 ]
+  },
+  {
+    name: "cum_log_probs"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+  },
+  {
+    name: "output_log_probs"
+    data_type: TYPE_FP32
+    dims: [ -1, -1 ]
+  }
+]
+ensemble_scheduling {
+  step [
+    {
+      model_name: "preprocessing"
+      model_version: -1
+      input_map {
+        key: "QUERY"
+        value: "INPUT_0"
+      }
+      input_map {
+        key: "REQUEST_OUTPUT_LEN"
+        value: "INPUT_1"
+      }
+      input_map {
+        key: "BAD_WORDS_DICT"
+        value: "INPUT_2"
+      }
+      input_map {
+        key: "STOP_WORDS_DICT"
+        value: "INPUT_3"
+      }
+      output_map {
+        key: "INPUT_ID"
+        value: "_INPUT_ID"
+      }
+      output_map {
+        key: "BAD_WORDS_IDS"
+        value: "_BAD_WORDS_IDS"
+      }
+      output_map {
+        key: "STOP_WORDS_IDS"
+        value: "_STOP_WORDS_IDS"
+      }
+      output_map {
+        key: "REQUEST_INPUT_LEN"
+        value: "_REQUEST_INPUT_LEN"
+      }
+      output_map {
+        key: "REQUEST_OUTPUT_LEN"
+        value: "_REQUEST_OUTPUT_LEN"
+      }
+    },
+    {
+      model_name: "fastertransformer"
+      model_version: -1
+      input_map {
+        key: "input_ids"
+        value: "_INPUT_ID"
+      }
+      input_map {
+        key: "input_lengths"
+        value: "_REQUEST_INPUT_LEN"
+      }
+      input_map {
+        key: "request_output_len"
+        value: "_REQUEST_OUTPUT_LEN"
+      }
+      input_map {
+        key: "prompt_learning_task_name_ids"
+        value: "prompt_learning_task_name_ids"
+      }
+      input_map {
+          key: "runtime_top_k"
+          value: "runtime_top_k"
+      }
+      input_map {
+          key: "runtime_top_p"
+          value: "runtime_top_p"
+      }
+      input_map {
+          key: "beam_search_diversity_rate"
+          value: "beam_search_diversity_rate"
+      }
+      input_map {
+          key: "temperature"
+          value: "temperature"
+      }
+      input_map {
+          key: "len_penalty"
+          value: "len_penalty"
+      }
+      input_map {
+          key: "repetition_penalty"
+          value: "repetition_penalty"
+      }
+      input_map {
+          key: "random_seed"
+          value: "random_seed"
+      }
+      input_map {
+          key: "is_return_log_probs"
+          value: "is_return_log_probs"
+      }
+      input_map {
+          key: "beam_width"
+          value: "beam_width"
+      }
+      input_map {
+          key: "start_id"
+          value: "start_id"
+      }
+      input_map {
+          key: "end_id"
+          value: "end_id"
+      }
+      input_map {
+          key: "stop_words_list"
+          value: "_STOP_WORDS_IDS"
+      }
+      input_map {
+          key: "bad_words_list"
+          value: "_BAD_WORDS_IDS"
+      }
+      input_map {
+        key: "top_p_decay"
+        value: "top_p_decay"
+      }
+      input_map {
+        key: "top_p_min"
+        value: "top_p_min"
+      }
+      input_map {
+        key: "top_p_reset_ids"
+        value: "top_p_reset_ids"
+      }
+      output_map {
+        key: "output_ids"
+        value: "_TOKENS_BATCH"
+      }
+      output_map {
+        key: "sequence_length"
+        value: "sequence_length"
+      }
+      output_map {
+        key: "cum_log_probs"
+        value: "cum_log_probs"
+      }
+      output_map {
+        key: "output_log_probs"
+        value: "output_log_probs"
+      }
+    },
+    {
+      model_name: "postprocessing"
+      model_version: -1
+      input_map {
+        key: "TOKENS_BATCH"
+        value: "_TOKENS_BATCH"
+      }
+      input_map {
+        key: "sequence_length"
+        value: "sequence_length"
+      }
+      output_map {
+        key: "OUTPUT"
+        value: "OUTPUT_0"
+      }
+    }
+  ]
+}
diff --git a/all_models/llama/fastertransformer/1/config.ini b/all_models/llama/fastertransformer/1/config.ini
@@ -0,0 +1,11 @@
+[llama]
+model_name = llama
+head_num = 32
+size_per_head = 128
+inter_size = 11008
+num_layer = 32
+rotary_embedding = 128
+vocab_size = 32000
+start_id = 0
+end_id = 1
+weight_data_type = fp16