refactor
xrsrke committed Jan 31, 2024
1 parent e5e2045 commit 5d9652a
Showing 4 changed files with 7 additions and 14 deletions.
5 changes: 0 additions & 5 deletions .github/workflows/3d_parallelism_unit_tests.yaml
@@ -48,11 +48,6 @@ jobs:
           pip install -e .[dev]
           pip install -e .[test]
-      - name: Install test dependencies
-        run: |
-          pip install pytest
-          pip install pytest-cov
       - name: Show installed libraries and their versions
         run: pip freeze | tee installed.txt

5 changes: 0 additions & 5 deletions .github/workflows/fa2_unit_tests.yaml
@@ -51,11 +51,6 @@ jobs:
           pip install -e .[dev]
           pip install -e .[test]
-      - name: Install test dependencies
-        run: |
-          pip install pytest
-          pip install pytest-cov
       - name: Show installed libraries and their versions
         run: pip freeze | tee installed.txt
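Both workflow edits delete the same step: installing pytest and pytest-cov by hand, which is presumably redundant once `pip install -e .[test]` has pulled in the test extra. A minimal sketch of how such a `[test]` extra is typically declared — the file name, layout, and package lists below are assumptions for illustration, not nanotron's actual packaging:

# Hypothetical setup.py sketch: with a [test] extra declared like this,
# `pip install -e .[test]` installs pytest and pytest-cov on its own,
# which is what makes the deleted CI step redundant.
from setuptools import find_packages, setup

setup(
    name="nanotron",
    package_dir={"": "src"},
    packages=find_packages("src"),
    extras_require={
        "dev": ["pre-commit"],             # assumed contents
        "test": ["pytest", "pytest-cov"],  # the packages the CI step used to install
    },
)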

6 changes: 4 additions & 2 deletions src/nanotron/distributed.py
@@ -260,6 +260,8 @@ def initialize_torch_distributed():

     # Call the init process.
     port = find_free_port()
-    init_method = f"tcp://localhost:{port}"
-    dist.init_process_group(init_method=init_method, backend=backend, world_size=world_size, rank=rank, timeout=dist.default_pg_timeout)
+    init_method = f"env://localhost:{port}"
+    dist.init_process_group(
+        init_method=init_method, backend=backend, world_size=world_size, rank=rank, timeout=dist.default_pg_timeout
+    )
     return True
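The only functional change here is the `init_method` scheme (`tcp://` becomes `env://`); splitting the call across lines is cosmetic. For context, a minimal single-process sketch of the `torch.distributed.init_process_group` API — the `find_free_port` helper and the `gloo` backend are illustrative stand-ins, not nanotron's code (GPU runs would typically use `nccl`):

# Sketch only: how init_process_group consumes an init_method URL.
import socket
from datetime import timedelta

import torch.distributed as dist


def find_free_port() -> int:
    # Stand-in helper: bind to port 0 so the OS picks a free port, then release it.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("localhost", 0))
        return s.getsockname()[1]


def init_single_process_group() -> None:
    port = find_free_port()
    dist.init_process_group(
        # With tcp:// the rendezvous address comes from the URL itself;
        # with env:// torch reads MASTER_ADDR and MASTER_PORT from the
        # environment instead of the URL.
        init_method=f"tcp://localhost:{port}",
        backend="gloo",  # CPU-friendly backend for this sketch
        world_size=1,
        rank=0,
        timeout=timedelta(minutes=30),
    )
    dist.barrier()
    dist.destroy_process_group()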
5 changes: 3 additions & 2 deletions tests/helpers/utils.py
@@ -7,6 +7,7 @@
 from nanotron.parallel import ParallelContext
 from torch.distributed.launcher import elastic_launch

+
 def available_gpus():
     if not torch.cuda.is_available():
         return 0
@@ -91,7 +92,7 @@ def _init_distributed(func):
"""
nb_gpus = tp * dp * pp
run_id = uuid.uuid4()

config = torch.distributed.launcher.LaunchConfig(
min_nodes=1,
max_nodes=1,
@@ -100,7 +101,7 @@ def _init_distributed(func):
             rdzv_configs={"timeout": 60},
             # Setting port to `0` allows `torch` to randomly pick a port: https://pytorch.org/docs/stable/elastic/run.html#stacked-single-node-multi-worker
             # Works only for single node workload.
-            rdzv_endpoint=f"localhost:0",
+            rdzv_endpoint="localhost:0",
             run_id=str(run_id),
             max_restarts=0,
             # TODO @thomasw21: Tune as we increase the number of tests
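The only change in this hunk drops a needless `f` prefix from a plain string; the launcher setup around it is unchanged. For context, a self-contained sketch of the same elastic-launch pattern used by this helper — the `_worker` entrypoint, the two-process default, and the `c10d` rendezvous backend are assumptions for illustration, not nanotron's actual test helper:

# Sketch only: launching a function on several local workers with torch's
# elastic launcher, using port 0 so a free rendezvous port is picked.
import uuid

import torch.distributed as dist
from torch.distributed.launcher import LaunchConfig, elastic_launch


def _worker() -> int:
    # Each spawned process runs this; real tests would build a ParallelContext.
    dist.init_process_group(backend="gloo")  # rendezvous env vars are set by the launcher
    rank = dist.get_rank()
    dist.destroy_process_group()
    return rank


def launch_local_test(nproc_per_node: int = 2) -> dict:
    config = LaunchConfig(
        min_nodes=1,
        max_nodes=1,
        nproc_per_node=nproc_per_node,
        rdzv_backend="c10d",  # assumed; not shown in the diff
        rdzv_configs={"timeout": 60},
        # Port 0 lets torch pick a free port; single-node only, so a plain
        # string is enough and no f-string interpolation is needed.
        rdzv_endpoint="localhost:0",
        run_id=str(uuid.uuid4()),
        max_restarts=0,
    )
    # Returns a dict mapping local rank to the entrypoint's return value.
    return elastic_launch(config=config, entrypoint=_worker)()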