Skip to content

Commit

Permalink
Fix decode-only runs and improve online logging; batch size is still hard-coded
Browse files Browse the repository at this point in the history
  • Loading branch information
mvanniasingheTT committed Nov 13, 2024
1 parent ea29384 commit 1f9d587
Showing 1 changed file with 17 additions and 22 deletions.
39 changes: 17 additions & 22 deletions tests/mock_vllm_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -444,58 +444,53 @@ def _write_to_json(self, stats):

if len(stats.time_to_first_tokens_iter) > 0:
# if inference is done online, need to handle case where not all user requests are made at same engine step call
breakpoint()
if os.path.exists(self.filepath):
with open(self.filepath, "r") as file:
lines = file.readlines()
# load in last line if time to first token not completed for all users
if lines:
last_line = lines[-1]
last_data = json.loads(last_line)
if "time to first token" in last_data:
if (
"time to first token" in last_data
): # if still in prefill stage (incomplete for all users) or only doing prefill and no decode
if (
len(list(last_data["time to first token"].values())[0])
< 32
):
data = last_data
): # if incomplete prefill for all users
# data = last_data
self._append_new_users(data)
# find the index of the last user for which the first token was computed
last_user_processed = len(
list(last_data["time to first token"].values())[0]
)

else:
else: # if prefill already complete for all users
last_user_processed = 0
data["time to first token"] = {}
data["time to first token"][
f"Inference num:{self.num_inference}"
] = {}
self._append_new_users(data)

else:
else: # if in decode stage
last_user_processed = 0
data["time to first token"] = {}
data["time to first token"][
f"Inference num:{self.num_inference}"
] = {}
else:
self._append_new_users(data)
else: # if first forward pass
last_user_processed = 0
data["time to first token"] = {}
data["time to first token"][f"Inference num:{self.num_inference}"] = {}
self._append_new_users(data)

for user_idx, ttft in enumerate(stats.time_to_first_tokens_iter):
data["time to first token"][f"Inference num:{self.num_inference}"][
f"user {user_idx + last_user_processed}"
] = ttft

if (
len(data["time to first token"][f"Inference num:{self.num_inference}"])
== 32
): # if batch size == num users processed
self.num_inference += 1
self.num_inference += 1

if data:
with open(self.filepath, "a") as file:
json.dump(data, file)
file.write("\n") # Ensure each JSON object is on a new line

def _append_new_users(self, data):
data["time to first token"] = {}
data["time to first token"][f"Inference num:{self.num_inference}"] = {}

    def info(self, type: str, obj: SupportsMetricsInfo) -> None:
        """Metrics-info hook; not supported by this mock logger.

        Raises:
            NotImplementedError: always — this mock only records timing stats.
        """
        # NOTE(review): presumably part of vLLM's StatLoggerBase interface —
        # confirm against the base class this mock subclasses.
        raise NotImplementedError

0 comments on commit 1f9d587

Please sign in to comment.