From 5d9652abb9161d61c2f75df2fba1ae7f380d330a Mon Sep 17 00:00:00 2001
From: Phuc Nguyen
Date: Wed, 31 Jan 2024 13:11:23 +0000
Subject: [PATCH] refactor

---
 .github/workflows/3d_parallelism_unit_tests.yaml | 5 -----
 .github/workflows/fa2_unit_tests.yaml            | 5 -----
 src/nanotron/distributed.py                      | 6 ++++--
 tests/helpers/utils.py                           | 5 +++--
 4 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/3d_parallelism_unit_tests.yaml b/.github/workflows/3d_parallelism_unit_tests.yaml
index f2797418..332825bf 100644
--- a/.github/workflows/3d_parallelism_unit_tests.yaml
+++ b/.github/workflows/3d_parallelism_unit_tests.yaml
@@ -48,11 +48,6 @@ jobs:
           pip install -e .[dev]
           pip install -e .[test]
 
-      - name: Install test dependencies
-        run: |
-          pip install pytest
-          pip install pytest-cov
-
       - name: Show installed libraries and their versions
         run: pip freeze | tee installed.txt
 
diff --git a/.github/workflows/fa2_unit_tests.yaml b/.github/workflows/fa2_unit_tests.yaml
index 08a3184f..17f7475f 100644
--- a/.github/workflows/fa2_unit_tests.yaml
+++ b/.github/workflows/fa2_unit_tests.yaml
@@ -51,11 +51,6 @@ jobs:
           pip install -e .[dev]
           pip install -e .[test]
 
-      - name: Install test dependencies
-        run: |
-          pip install pytest
-          pip install pytest-cov
-
       - name: Show installed libraries and their versions
         run: pip freeze | tee installed.txt
 
diff --git a/src/nanotron/distributed.py b/src/nanotron/distributed.py
index 238dca9b..01438719 100644
--- a/src/nanotron/distributed.py
+++ b/src/nanotron/distributed.py
@@ -260,6 +260,8 @@ def initialize_torch_distributed():
 
     # Call the init process.
     port = find_free_port()
-    init_method = f"tcp://localhost:{port}"
-    dist.init_process_group(init_method=init_method, backend=backend, world_size=world_size, rank=rank, timeout=dist.default_pg_timeout)
+    init_method = f"env://localhost:{port}"
+    dist.init_process_group(
+        init_method=init_method, backend=backend, world_size=world_size, rank=rank, timeout=dist.default_pg_timeout
+    )
     return True
diff --git a/tests/helpers/utils.py b/tests/helpers/utils.py
index 516cc818..bc2ce00c 100644
--- a/tests/helpers/utils.py
+++ b/tests/helpers/utils.py
@@ -7,6 +7,7 @@
 from nanotron.parallel import ParallelContext
 from torch.distributed.launcher import elastic_launch
 
+
 def available_gpus():
     if not torch.cuda.is_available():
         return 0
@@ -91,7 +92,7 @@ def _init_distributed(func):
         """
         nb_gpus = tp * dp * pp
         run_id = uuid.uuid4()
-        
+
         config = torch.distributed.launcher.LaunchConfig(
             min_nodes=1,
             max_nodes=1,
@@ -100,7 +101,7 @@ def _init_distributed(func):
             rdzv_configs={"timeout": 60},
             # Setting port to `0` allows `torch` to randomly pick a port: https://pytorch.org/docs/stable/elastic/run.html#stacked-single-node-multi-worker
             # Works only for single node workload.
-            rdzv_endpoint=f"localhost:0",
+            rdzv_endpoint="localhost:0",
             run_id=str(run_id),
             max_restarts=0,
             # TODO @thomasw21: Tune as we increase the number of tests