From df7cb9d5957cd6c0b386e84d5bff6a14b381dc2f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?X=CE=BBRI-U5?=
Date: Mon, 29 Jan 2024 20:33:39 +0700
Subject: [PATCH] Update distributed.py

---
 src/nanotron/distributed.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/nanotron/distributed.py b/src/nanotron/distributed.py
index 238dca9b..889b2330 100644
--- a/src/nanotron/distributed.py
+++ b/src/nanotron/distributed.py
@@ -259,7 +259,22 @@ def initialize_torch_distributed():
         backend = "gloo"
 
     # Call the init process.
-    port = find_free_port()
+    pytest_worker_id = os.environ.get("PYTEST_XDIST_WORKER")
+    if pytest_worker_id is None:
+        port = find_free_port()
+    else:
+        def string_to_unique_number(s, min_port=2000, max_port=65000):
+            import hashlib
+            # Hash the worker id string
+            hash_object = hashlib.sha256(s.encode())
+            hash_number = int(hash_object.hexdigest(), base=16)
+
+            # Map the hash into the [min_port, max_port) range
+            range_size = max_port - min_port
+            return min_port + (hash_number % range_size)
+
+        port = string_to_unique_number(pytest_worker_id)
+
     init_method = f"tcp://localhost:{port}"
     dist.init_process_group(init_method=init_method, backend=backend, world_size=world_size, rank=rank, timeout=dist.default_pg_timeout)
     return True
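
For reference (not part of the patch itself), a minimal standalone sketch of the port-derivation idea above, assuming the usual pytest-xdist worker ids such as "gw0" and "gw1"; the port bounds and the example ids are illustrative:

    import hashlib

    def string_to_unique_number(s, min_port=2000, max_port=65000):
        # Hash the id and map it deterministically into [min_port, max_port).
        hash_number = int(hashlib.sha256(s.encode()).hexdigest(), base=16)
        return min_port + (hash_number % (max_port - min_port))

    # Each xdist worker id maps to a stable port, so parallel test workers
    # started by pytest-xdist do not race for the same TCP port.
    for worker_id in ("gw0", "gw1", "gw2"):
        print(worker_id, "->", string_to_unique_number(worker_id))

Note that this only separates workers whose ids hash differently; two ids could in principle land on the same port, and unlike find_free_port() the derived port is not checked for availability.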