From 5d9652abb9161d61c2f75df2fba1ae7f380d330a Mon Sep 17 00:00:00 2001
From: Phuc Nguyen
Date: Wed, 31 Jan 2024 13:11:23 +0000
Subject: [PATCH] refactor

---
 .github/workflows/3d_parallelism_unit_tests.yaml | 5 -----
 .github/workflows/fa2_unit_tests.yaml            | 5 -----
 src/nanotron/distributed.py                      | 6 ++++--
 tests/helpers/utils.py                           | 5 +++--
 4 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/3d_parallelism_unit_tests.yaml b/.github/workflows/3d_parallelism_unit_tests.yaml
index f2797418..332825bf 100644
--- a/.github/workflows/3d_parallelism_unit_tests.yaml
+++ b/.github/workflows/3d_parallelism_unit_tests.yaml
@@ -48,11 +48,6 @@ jobs:
           pip install -e .[dev]
           pip install -e .[test]
 
-      - name: Install test dependencies
-        run: |
-          pip install pytest
-          pip install pytest-cov
-
       - name: Show installed libraries and their versions
         run: pip freeze | tee installed.txt
 
diff --git a/.github/workflows/fa2_unit_tests.yaml b/.github/workflows/fa2_unit_tests.yaml
index 08a3184f..17f7475f 100644
--- a/.github/workflows/fa2_unit_tests.yaml
+++ b/.github/workflows/fa2_unit_tests.yaml
@@ -51,11 +51,6 @@ jobs:
           pip install -e .[dev]
           pip install -e .[test]
 
-      - name: Install test dependencies
-        run: |
-          pip install pytest
-          pip install pytest-cov
-
       - name: Show installed libraries and their versions
         run: pip freeze | tee installed.txt
 
diff --git a/src/nanotron/distributed.py b/src/nanotron/distributed.py
index 238dca9b..01438719 100644
--- a/src/nanotron/distributed.py
+++ b/src/nanotron/distributed.py
@@ -260,6 +260,8 @@ def initialize_torch_distributed():
 
     # Call the init process.
     port = find_free_port()
-    init_method = f"tcp://localhost:{port}"
-    dist.init_process_group(init_method=init_method, backend=backend, world_size=world_size, rank=rank, timeout=dist.default_pg_timeout)
+    init_method = f"env://localhost:{port}"
+    dist.init_process_group(
+        init_method=init_method, backend=backend, world_size=world_size, rank=rank, timeout=dist.default_pg_timeout
+    )
     return True
diff --git a/tests/helpers/utils.py b/tests/helpers/utils.py
index 516cc818..bc2ce00c 100644
--- a/tests/helpers/utils.py
+++ b/tests/helpers/utils.py
@@ -7,6 +7,7 @@
 from nanotron.parallel import ParallelContext
 from torch.distributed.launcher import elastic_launch
 
+
 def available_gpus():
     if not torch.cuda.is_available():
         return 0
@@ -91,7 +92,7 @@ def _init_distributed(func):
         """
         nb_gpus = tp * dp * pp
         run_id = uuid.uuid4()
-        
+
         config = torch.distributed.launcher.LaunchConfig(
             min_nodes=1,
             max_nodes=1,
@@ -100,7 +101,7 @@ def _init_distributed(func):
             rdzv_configs={"timeout": 60},
             # Setting port to `0` allows `torch` to randomly pick a port: https://pytorch.org/docs/stable/elastic/run.html#stacked-single-node-multi-worker
             # Works only for single node workload.
-            rdzv_endpoint=f"localhost:0",
+            rdzv_endpoint="localhost:0",
             run_id=str(run_id),
             max_restarts=0,
             # TODO @thomasw21: Tune as we increase the number of tests