refactor
xrsrke committed Jan 31, 2024
1 parent e5e2045 commit 5d9652a
Showing 4 changed files with 7 additions and 14 deletions.
5 changes: 0 additions & 5 deletions .github/workflows/3d_parallelism_unit_tests.yaml
@@ -48,11 +48,6 @@ jobs:
           pip install -e .[dev]
           pip install -e .[test]
-      - name: Install test dependencies
-        run: |
-          pip install pytest
-          pip install pytest-cov
       - name: Show installed libraries and their versions
         run: pip freeze | tee installed.txt

5 changes: 0 additions & 5 deletions .github/workflows/fa2_unit_tests.yaml
@@ -51,11 +51,6 @@ jobs:
           pip install -e .[dev]
           pip install -e .[test]
-      - name: Install test dependencies
-        run: |
-          pip install pytest
-          pip install pytest-cov
       - name: Show installed libraries and their versions
         run: pip freeze | tee installed.txt
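Both workflow edits delete the same step: installing pytest and pytest-cov by hand, which is presumably redundant once `pip install -e .[test]` has pulled in the test extra. A minimal sketch of how such a `[test]` extra is typically declared — the file name, layout, and package lists below are assumptions for illustration, not nanotron's actual packaging:

# Hypothetical setup.py sketch: with a [test] extra declared like this,
# `pip install -e .[test]` installs pytest and pytest-cov on its own,
# which is what makes the deleted CI step redundant.
from setuptools import find_packages, setup

setup(
    name="nanotron",
    package_dir={"": "src"},
    packages=find_packages("src"),
    extras_require={
        "dev": ["pre-commit"],             # assumed contents
        "test": ["pytest", "pytest-cov"],  # the packages the CI step used to install
    },
)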

6 changes: 4 additions & 2 deletions src/nanotron/distributed.py
@@ -260,6 +260,8 @@ def initialize_torch_distributed():

     # Call the init process.
     port = find_free_port()
-    init_method = f"tcp://localhost:{port}"
-    dist.init_process_group(init_method=init_method, backend=backend, world_size=world_size, rank=rank, timeout=dist.default_pg_timeout)
+    init_method = f"env://localhost:{port}"
+    dist.init_process_group(
+        init_method=init_method, backend=backend, world_size=world_size, rank=rank, timeout=dist.default_pg_timeout
+    )
     return True
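The only functional change here is the `init_method` scheme (`tcp://` becomes `env://`); splitting the call across lines is cosmetic. For context, a minimal single-process sketch of the `torch.distributed.init_process_group` API — the `find_free_port` helper and the `gloo` backend are illustrative stand-ins, not nanotron's code (GPU runs would typically use `nccl`):

# Sketch only: how init_process_group consumes an init_method URL.
import socket
from datetime import timedelta

import torch.distributed as dist


def find_free_port() -> int:
    # Stand-in helper: bind to port 0 so the OS picks a free port, then release it.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("localhost", 0))
        return s.getsockname()[1]


def init_single_process_group() -> None:
    port = find_free_port()
    dist.init_process_group(
        # With tcp:// the rendezvous address comes from the URL itself;
        # with env:// torch reads MASTER_ADDR and MASTER_PORT from the
        # environment instead of the URL.
        init_method=f"tcp://localhost:{port}",
        backend="gloo",  # CPU-friendly backend for this sketch
        world_size=1,
        rank=0,
        timeout=timedelta(minutes=30),
    )
    dist.barrier()
    dist.destroy_process_group()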
5 changes: 3 additions & 2 deletions tests/helpers/utils.py
@@ -7,6 +7,7 @@
 from nanotron.parallel import ParallelContext
 from torch.distributed.launcher import elastic_launch

+
 def available_gpus():
     if not torch.cuda.is_available():
         return 0
@@ -91,7 +92,7 @@ def _init_distributed(func):
"""
nb_gpus = tp * dp * pp
run_id = uuid.uuid4()

config = torch.distributed.launcher.LaunchConfig(
min_nodes=1,
max_nodes=1,
@@ -100,7 +101,7 @@ def _init_distributed(func):
             rdzv_configs={"timeout": 60},
             # Setting port to `0` allows `torch` to randomly pick a port: https://pytorch.org/docs/stable/elastic/run.html#stacked-single-node-multi-worker
             # Works only for single node workload.
-            rdzv_endpoint=f"localhost:0",
+            rdzv_endpoint="localhost:0",
             run_id=str(run_id),
             max_restarts=0,
             # TODO @thomasw21: Tune as we increase the number of tests
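The only change in this hunk drops a needless `f` prefix from a plain string; the launcher setup around it is unchanged. For context, a self-contained sketch of the same elastic-launch pattern used by this helper — the `_worker` entrypoint, the two-process default, and the `c10d` rendezvous backend are assumptions for illustration, not nanotron's actual test helper:

# Sketch only: launching a function on several local workers with torch's
# elastic launcher, using port 0 so a free rendezvous port is picked.
import uuid

import torch.distributed as dist
from torch.distributed.launcher import LaunchConfig, elastic_launch


def _worker() -> int:
    # Each spawned process runs this; real tests would build a ParallelContext.
    dist.init_process_group(backend="gloo")  # rendezvous env vars are set by the launcher
    rank = dist.get_rank()
    dist.destroy_process_group()
    return rank


def launch_local_test(nproc_per_node: int = 2) -> dict:
    config = LaunchConfig(
        min_nodes=1,
        max_nodes=1,
        nproc_per_node=nproc_per_node,
        rdzv_backend="c10d",  # assumed; not shown in the diff
        rdzv_configs={"timeout": 60},
        # Port 0 lets torch pick a free port; single-node only, so a plain
        # string is enough and no f-string interpolation is needed.
        rdzv_endpoint="localhost:0",
        run_id=str(uuid.uuid4()),
        max_restarts=0,
    )
    # Returns a dict mapping local rank to the entrypoint's return value.
    return elastic_launch(config=config, entrypoint=_worker)()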