From 0cdca230490722e473357dd7f9d4525a8a6d08bb Mon Sep 17 00:00:00 2001
From: Muhammad Asif Manzoor
Date: Thu, 12 Dec 2024 21:52:39 +0000
Subject: [PATCH] cleanup and fix torch.multiprocessing Queue exception
 handling

---
 tt_torch/dynamo/backend.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tt_torch/dynamo/backend.py b/tt_torch/dynamo/backend.py
index 5c7f28e6..8e1832bc 100644
--- a/tt_torch/dynamo/backend.py
+++ b/tt_torch/dynamo/backend.py
@@ -256,7 +256,7 @@ def compile_op(self, node, *inputs, **kwargs):
         op.add_stable_hlo_graph(module.operation.get_asm())
 
         # mp.set_start_method('spawn')
-        # mp.set_sharing_strategy('file_system')
+        mp.set_sharing_strategy("file_system")
         torch.set_num_threads(1)
         sender = mp.Queue()
         receiver = mp.Queue()
@@ -287,7 +287,7 @@
                 ttnn_event.set()
                 op.compilation_status = OpCompilationStatus.CONVERTED_TO_TTNN
                 break
-            except mp.queues.Empty:
+            except Exception as e:
                 pass
             if time.time() - start > self.compiler_config.single_op_timeout:
                 process.terminate()
@@ -312,7 +312,6 @@ def pre_process_inputs(self, *inputs):
         return processed_inputs
 
     def run_op(self, binary, *inputs):
-        print("run_op", file=sys.stderr)
         inputs = self.pre_process_inputs(*inputs)
         # mp.set_start_method('spawn')
         # mp.set_sharing_strategy('file_system')
@@ -334,11 +333,11 @@
                 break
             try:
                 # result = receiver.get_nowait()
-                result = receiver.get(timeout=0.01)
+                result = receiver.get(timeout=0.05)
                 outputs = result["outputs"]
                 exec_event.set()
                 break
-            except mp.queues.Empty:
+            except Exception as e:
                 pass
             if time.time() - start > self.compiler_config.single_op_timeout:
                 process.terminate()