From e60e645e5f959ae90da24d25c308ba8df360b1af Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Thu, 28 Sep 2023 14:27:06 +0800 Subject: [PATCH 01/26] add a white change that breaks formatting --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index d9aed9b47bd8..30d4c166c8c5 100755 --- a/setup.py +++ b/setup.py @@ -211,6 +211,7 @@ def op_enabled(op_name): def create_dir_symlink(src, dest): + if not os.path.islink(dest): if os.path.exists(dest): os.remove(dest) From ed95d21e96308b11332d1c4720d225a53c28d187 Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Sat, 30 Sep 2023 22:38:11 +0800 Subject: [PATCH 02/26] fix TestModelTask --- tests/unit/inference/test_inference.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index 894f040be207..602c015dc1ee 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -280,6 +280,9 @@ def test( if invalid_test_msg: pytest.skip(invalid_test_msg) + if dtype not in get_accelerator().supported_dtypes(): + pytest.skip(f"Acceleraor {get_accelerator().device_name()} does not support {dtype}.") + model, task = model_w_task local_rank = int(os.getenv("LOCAL_RANK", "0")) From f0022b076be2e06aea3e1c8d4e7b81f033a04acc Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Sun, 1 Oct 2023 11:39:08 +0800 Subject: [PATCH 03/26] Skip TestModelTask if InferenceBuilder are not implemented --- tests/unit/inference/test_inference.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index 602c015dc1ee..bf8182796c86 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -283,6 +283,9 @@ def test( if dtype not in get_accelerator().supported_dtypes(): pytest.skip(f"Acceleraor {get_accelerator().device_name()} does not support {dtype}.") + if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: + pytest.skip("This op had not been implemented on this system.", allow_module_level=True) + model, task = model_w_task local_rank = int(os.getenv("LOCAL_RANK", "0")) From af2f380764cf677a7e058132938365c0d4dfb29f Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Sun, 1 Oct 2023 14:24:05 +0800 Subject: [PATCH 04/26] remove blank change --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 30d4c166c8c5..d9aed9b47bd8 100755 --- a/setup.py +++ b/setup.py @@ -211,7 +211,6 @@ def op_enabled(op_name): def create_dir_symlink(src, dest): - if not os.path.islink(dest): if os.path.exists(dest): os.remove(dest) From ac4254fbfea5d93c78992a81ef0504d5d84ee381 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Sat, 7 Oct 2023 15:27:01 +0800 Subject: [PATCH 05/26] Reuse hf_model list among tests to avoid slow loading (#16) * Reuse hf_model list among tests to avoid slow loading * try to debug test skip * another attempt to print test failure * another attempt * more attempt to print skip reason * revert changes that are temporary --- tests/unit/inference/test_inference.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index bf8182796c86..b86447c9fbb7 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -5,6 +5,7 @@ import os import time +import pickle import torch import pytest import itertools @@ -65,7 +66,13 @@ ] # Get a list of all models and 
mapping from task to supported models -_hf_models = list(HfApi().list_models()) +try: + with open("hf_models.pkl", "rb") as fp: + _hf_models = pickle.load(fp) +except FileNotFoundError: + _hf_models = list(HfApi().list_models()) + with open("hf_models.pkl", "wb") as fp: + pickle.dump(_hf_models, fp) _hf_model_names = [m.modelId for m in _hf_models] _hf_task_to_models = {task: [m.modelId for m in _hf_models if m.pipeline_tag == task] for task in _test_tasks} From cc0294f824db968a21cafbbec82dbdfea6137129 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Sun, 8 Oct 2023 19:54:10 +0800 Subject: [PATCH 06/26] Change COLUMNS to 140 to allow display of pytest skip message; Sanity check before run unit tests * Reuse hf_model list among tests to avoid slow loading * try to debug test skip * another attempt to print test failure * another attempt * more attempt to print skip reason * revert changes that are temporary * remove extra flag for pytest * add a dummy test to test pytest * test skip message * put old test and temp test together to compare * try to find out the reason skip message are not printed * comment all skips * check skip in common.py * revert last commits * shorten name to show skip message * change test name * expand number of columns to 120 when running pytest * detect deepspeed installation * add test code for environment * change pytorch version 2.1.0==>2.0.1 * add py-cpuinfo as requiiremetns to dev * install py-cpuinfo manually * Change COLUMNS to 140 to allow display of pytest skip message --- .github/workflows/cpu-inference.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index 8eeca324c350..c35b096b56db 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -74,7 +74,10 @@ jobs: - name: Unit tests run: | source oneCCL/build/_install/env/setvars.sh + # check whether the environment is properly setup + python -c "import torch;import intel_extension_for_pytorch as ipex;import oneccl_bindings_for_pytorch;print('done')" + python -c "import deepspeed;from deepspeed.accelerator import get_accelerator;print(get_accelerator().device_name());print(get_accelerator().is_available())" unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/ - TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' -m 'inference' unit/ + COLUMNS=140 TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/ + COLUMNS=140 TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' -m 'inference' unit/ From 861088fa059bd191f3180097877097ab9c90b9f5 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Wed, 25 Oct 2023 10:24:49 +0800 Subject: [PATCH 07/26] Gma/fix cpu inference local (#19) * Reuse hf_model list among tests to avoid slow loading * try to debug test skip * another attempt to print test failure * another attempt * more attempt to print skip reason * revert changes that are temporary * remove extra flag for pytest * add a dummy test to test pytest * test skip message * put old test and temp test together to compare * try to find out the reason skip message are not printed * comment all skips * check skip in common.py * revert last commits * shorten name to show skip 
message * change test name * expand number of columns to 120 when running pytest * detect deepspeed installation * add test code for environment * change pytorch version 2.1.0==>2.0.1 * add py-cpuinfo as requiiremetns to dev * install py-cpuinfo manually * Change COLUMNS to 140 to allow display of pytest skip message * ping pytorch to 2.0.1 * add pip list before install deepspeed * install cpuinfo before install deepspeed * change workflow to work with pytorch 2.1 * add torch install to CI workflow * install py-cpuinfo * enforce autotp test on single socket instance * enforce 2 ranks in cpu autotp tests * enable tests that can only run on torch 2.1 or above * make build faster * remove -j make option * add back skip for codegen * check UT result * update tutorial --- .github/workflows/cpu-inference.yml | 15 +++++++++++---- .../accelerator-abstraction-interface.md | 2 +- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index 6fdc5f386445..b37b26fd6d10 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -39,8 +39,14 @@ jobs: - name: Install oneCCL Bindings for PyTorch run: | + pip install torch python -m pip install intel_extension_for_pytorch - python -m pip install oneccl_bind_pt==2.0 -f https://developer.intel.com/ipex-whl-stable-cpu + python -m pip install oneccl_bind_pt -f https://developer.intel.com/ipex-whl-stable-cpu + pip install py-cpuinfo + # check installed version + pip list |grep \\\ + pip list |grep intel-extension-for-pytorch + pip list |grep oneccl-bind-pt - name: Install oneCCL run: | @@ -79,6 +85,7 @@ jobs: python -c "import torch;import intel_extension_for_pytorch as ipex;import oneccl_bindings_for_pytorch;print('done')" python -c "import deepspeed;from deepspeed.accelerator import get_accelerator;print(get_accelerator().device_name());print(get_accelerator().is_available())" unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch - cd tests - COLUMNS=140 TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/ - COLUMNS=140 TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' -m 'inference' unit/ + cd tests + # LOCAL_SIZE=2 enforce CPU to report 2 devices, this helps run the test on github default runner + LOCAL_SIZE=2 COLUMNS=240 TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/ + LOCAL_SIZE=2 COLUMNS=240 TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' -m 'inference' unit/ diff --git a/docs/_tutorials/accelerator-abstraction-interface.md b/docs/_tutorials/accelerator-abstraction-interface.md index 0810c3c6b5d7..db1a6005f793 100644 --- a/docs/_tutorials/accelerator-abstraction-interface.md +++ b/docs/_tutorials/accelerator-abstraction-interface.md @@ -96,7 +96,7 @@ To run DeepSpeed model on CPU, use the following steps to prepare environment: ``` python -m pip install intel_extension_for_pytorch -python -m pip install oneccl_bind_pt==2.0 -f https://developer.intel.com/ipex-whl-stable-cpu +python -m pip install oneccl_bind_pt -f https://developer.intel.com/ipex-whl-stable-cpu git clone https://github.com/oneapi-src/oneCCL cd oneCCL mkdir build From 48787d9970fac6089d74449de67a5835fd5cc1ad Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Wed, 25 Oct 2023 02:43:47 +0000 Subject: [PATCH 08/26] 
change cpu inference test to self hosted v100 runner --- .github/workflows/cpu-inference.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index b37b26fd6d10..2c43952d4c1f 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -15,7 +15,7 @@ concurrency: jobs: unit-tests: - runs-on: ubuntu-20.04 + runs-on: [self-hosted, nvidia, cu116, v100] steps: - uses: actions/checkout@v3 From 17183bd246889bbf64f222f19227d500a54efa71 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Thu, 26 Oct 2023 06:05:17 +0000 Subject: [PATCH 09/26] Running on self-hosted cpu rather than cuda machine. --- .github/workflows/cpu-inference.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index 2c43952d4c1f..27d59d4049cb 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -15,7 +15,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu116, v100] + runs-on: [self-hosted, cpu] steps: - uses: actions/checkout@v3 From f40a484ee6bbad517a6d5a55e049cd6272c3b14a Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Mon, 30 Oct 2023 02:46:41 +0000 Subject: [PATCH 10/26] remove ad-hoc running of cpu-inference --- .github/workflows/cpu-inference.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index 27d59d4049cb..4b06324f96b8 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -5,7 +5,6 @@ on: paths-ignore: - 'docs/**' - 'blogs/**' - workflow_dispatch: merge_group: branches: [ master ] From 4ed3b60343f0344bf4e40fd020e90e0bd387e365 Mon Sep 17 00:00:00 2001 From: Liangliang-Ma <1906710196@qq.com> Date: Mon, 30 Oct 2023 12:06:40 +0800 Subject: [PATCH 11/26] update ccl.py for error type (#24) * Remove PP Grad Tail Check (#2538) * Only communicate grad tail if it exists Co-authored-by: Dashiell Stander * Revert previous patch and just always send the grad tail * Formatting --------- Co-authored-by: Dashiell Stander Co-authored-by: Olatunji Ruwase Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> * Added __HIP_PLATFORM_AMD__=1 (#4570) * fix multiple definition while building evoformer (#4556) Current builder for evoformer use the same name for `attention.cpp` and `attention.cu`, leading to same intermediate filename `attention.o`: ```shell march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe - isystem /home/zejianxie/.conda/envs/dll/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /home/zejianxie/.conda/envs/dll/include build/temp.linux-x86_64-cpython- 310/csrc/deepspeed4science/evoformer_attn/attention.o build/temp.linux-x86_64-cpython- 310/csrc/deepspeed4science/evoformer_attn/attention.o build/temp.linux-x86_64-cpython- 310/csrc/deepspeed4science/evoformer_attn/attention_back.o ``` and ``` `attention_impl(at::Tensor&, at::Tensor&, at::Tensor&, at::Tensor&, at::Tensor&, at::Tensor&, at::Tensor&)': tmpxft_0012bef1_00000000-6_attention.compute_86.cudafe1.cpp:(.text+0x330): multiple definition of `attention_impl(at::Tensor&, at::Tensor&, at::Tensor&, at::Tensor&, at::Tensor&, at::Tensor&, at::Tensor&)'; 
build/temp.linux-x86_64-cpython-310/csrc/deepspeed4science/evoformer_attn/attention.o:tmpxft_0012bef1_00000000-6_attention.compute_86.cudafe1.cpp:(.text+0x330): first defined here /home/zejianxie/.conda/envs/dll/bin/../lib/gcc/x86_64-conda-linux-gnu/11.4.0/../../../../x86_64-conda-linux-gnu/bin/ld: build/temp.linux-x86_64-cpython-310/csrc/deepspeed4science/evoformer_attn/attention.o:(.bss+0x0): multiple definition of `torch::autograd::(anonymous namespace)::graph_task_id'; build/temp.linux-x86_64-cpython-310/csrc/deepspeed4science/evoformer_attn/attention.o:(.bss+0x0): first defined here ``` I use following to reproduce and confirm my fix works: ``` git clone https://github.com/NVIDIA/cutlass --depth 1 CUTLASS_PATH=$PWD/cutlass DS_BUILD_EVOFORMER_ATTN=1 pip install ./DeepSpeed --global-option="build_ext" ``` ![image](https://github.com/microsoft/DeepSpeed/assets/41792945/9e406b37-330c-431c-8bf9-6be378dee4ff) Co-authored-by: Conglong Li * Update ccl.py --------- Co-authored-by: Quentin Anthony Co-authored-by: Dashiell Stander Co-authored-by: Olatunji Ruwase Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Ramya Ramineni <62723901+rraminen@users.noreply.github.com> Co-authored-by: Xie Zejian Co-authored-by: Conglong Li --- .../evoformer_attn/{attention.cu => attention_cu.cu} | 0 deepspeed/comm/ccl.py | 2 +- deepspeed/runtime/pipe/engine.py | 2 +- op_builder/builder.py | 3 +++ op_builder/evoformer_attn.py | 2 +- 5 files changed, 6 insertions(+), 3 deletions(-) rename csrc/deepspeed4science/evoformer_attn/{attention.cu => attention_cu.cu} (100%) diff --git a/csrc/deepspeed4science/evoformer_attn/attention.cu b/csrc/deepspeed4science/evoformer_attn/attention_cu.cu similarity index 100% rename from csrc/deepspeed4science/evoformer_attn/attention.cu rename to csrc/deepspeed4science/evoformer_attn/attention_cu.cu diff --git a/deepspeed/comm/ccl.py b/deepspeed/comm/ccl.py index 6e915d0ca430..38a22f20109f 100644 --- a/deepspeed/comm/ccl.py +++ b/deepspeed/comm/ccl.py @@ -170,7 +170,7 @@ def get_all_ranks_from_group(self, group): while True: results.append(super(CCLBackend, self).get_global_rank(group, rank)) rank += 1 - except ValueError: + except (ValueError, RuntimeError): pass if tuple(results) not in self.groups: self._new_group(results, group) diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index 2542dd602b1f..b89c1aca7f2a 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -988,7 +988,7 @@ def _exec_send_grads(self, buffer_id): if isinstance(inputs, tuple): first_input = inputs[0] assert all([torch.is_tensor(elt) for elt in inputs[1:]]) - inputs_grad_tail = [elt.grad for elt in inputs[1:] if elt.grad is not None] + inputs_grad_tail = [elt.grad for elt in inputs[1:]] elif torch.is_tensor(inputs): first_input = inputs inputs_grad_tail = [] diff --git a/op_builder/builder.py b/op_builder/builder.py index 1fdcd485f5fb..62683774f20a 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -486,6 +486,9 @@ def jit_load(self, verbose=True): cxx_args.append("-DBF16_AVAILABLE") nvcc_args.append("-DBF16_AVAILABLE") + if self.is_rocm_pytorch(): + cxx_args.append("-D__HIP_PLATFORM_AMD__=1") + op_module = load(name=self.name, sources=self.strip_empty_entries(sources), extra_include_paths=self.strip_empty_entries(extra_include_paths), diff --git a/op_builder/evoformer_attn.py b/op_builder/evoformer_attn.py index f4311848d0d4..6e7721f94e01 100644 --- a/op_builder/evoformer_attn.py +++ 
b/op_builder/evoformer_attn.py @@ -27,7 +27,7 @@ def extra_ldflags(self): def sources(self): src_dir = 'csrc/deepspeed4science/evoformer_attn' - return [f'{src_dir}/attention.cpp', f'{src_dir}/attention_back.cu', f'{src_dir}/attention.cu'] + return [f'{src_dir}/attention.cpp', f'{src_dir}/attention_back.cu', f'{src_dir}/attention_cu.cu'] def nvcc_args(self): args = super().nvcc_args() From 15295ae5a5aa9f2e20c554b7b42fc2ecaa729e49 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Fri, 3 Nov 2023 07:17:49 +0000 Subject: [PATCH 12/26] install gcc-9 in cpu workflow --- .github/workflows/cpu-inference.yml | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index 61bd68c737b0..adf309bef39e 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -24,6 +24,17 @@ jobs: - id: setup-venv uses: ./.github/workflows/setup-venv + - name: Install gcc-9 + run: | + sudo add-apt-repository -u ppa:ubuntu-toolchain-r/test + sudo apt install -y gcc-9 + + - name: Check gcc version + run: | + # Get gcc version + gcc --version + g++ --version + - name: Detect instruction sets on instance run: | lscpu @@ -75,16 +86,16 @@ jobs: pip install .[dev,1bit,autotuning,inf] ds_report - - name: Python environment + - name: Python environment check run: | pip list - - - name: Unit tests - run: | source oneCCL/build/_install/env/setvars.sh # check whether the environment is properly setup python -c "import torch;import intel_extension_for_pytorch as ipex;import oneccl_bindings_for_pytorch;print('done')" python -c "import deepspeed;from deepspeed.accelerator import get_accelerator;print(get_accelerator().device_name());print(get_accelerator().is_available())" + + - name: Unit tests + run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests # LOCAL_SIZE=2 enforce CPU to report 2 devices, this helps run the test on github default runner From d52ff77c6bcb50d3675bc21ed868012c64cb17b6 Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Sun, 5 Nov 2023 00:10:58 +0800 Subject: [PATCH 13/26] set gcc/g++ default to 9 in cpu inference workflow --- .github/workflows/cpu-inference.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index adf309bef39e..89df1d256179 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -27,7 +27,10 @@ jobs: - name: Install gcc-9 run: | sudo add-apt-repository -u ppa:ubuntu-toolchain-r/test - sudo apt install -y gcc-9 + sudo apt install -y gcc-9 g++-9 + # set gcc-9 and g++9 to default + sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 99 + sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 99 - name: Check gcc version run: | From e9fafa7cae0ff5dd9fb535a4bcdf12833b815064 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Tue, 7 Nov 2023 03:18:23 +0000 Subject: [PATCH 14/26] update oneccl_bind_pt installation steps --- .github/workflows/cpu-inference.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index 89df1d256179..ce5c86523506 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -56,7 +56,9 @@ jobs: run: | pip install torch python -m pip install intel_extension_for_pytorch - python -m pip install oneccl_bind_pt -f 
https://developer.intel.com/ipex-whl-stable-cpu + # the curl line is for troubleshootingn + curl -L https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ + python -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ pip install py-cpuinfo # check installed version pip list |grep \\\ From 51922e4872d06fca2a77376791a92a7dd11d02de Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Thu, 16 Nov 2023 03:21:10 +0000 Subject: [PATCH 15/26] mitigation for oneCCL GLIBCXX_3.4.30 not found issue --- .github/workflows/cpu-inference.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index ce5c86523506..0ee715b5aa1a 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -38,6 +38,11 @@ jobs: gcc --version g++ --version + - name: Mitigation for GLIBCXX_3.4.30 + run: | + # install glibc higher version + conda install -c conda-forge libstdcxx-ng=12 + - name: Detect instruction sets on instance run: | lscpu From fc6025c434c9271ef0c4108a9cbe32586caf83ab Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Sat, 18 Nov 2023 22:54:52 +0800 Subject: [PATCH 16/26] use sudo to install conda package --- .github/workflows/cpu-inference.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index 0ee715b5aa1a..6ad9c7098cad 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -41,7 +41,7 @@ jobs: - name: Mitigation for GLIBCXX_3.4.30 run: | # install glibc higher version - conda install -c conda-forge libstdcxx-ng=12 + sudo conda install -y -c conda-forge libstdcxx-ng=12 - name: Detect instruction sets on instance run: | From a8cec8b83d9b6a22b8e30f13bc4849ae5a0a41fd Mon Sep 17 00:00:00 2001 From: Liangliang-Ma <1906710196@qq.com> Date: Tue, 21 Nov 2023 17:02:01 +0800 Subject: [PATCH 17/26] ccl issues fix (#32) * fix api bugs * remove group in inference_all_reduce * remove group from inference_all_reduce in ccl.cpp * direct call all_reduce and inference_all_reduce * Update ccl.py --- csrc/cpu/comm/ccl.cpp | 4 ++-- deepspeed/comm/ccl.py | 44 +++++++++++++++++++++++++++++-------------- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/csrc/cpu/comm/ccl.cpp b/csrc/cpu/comm/ccl.cpp index 9a04b6f873f2..6428ab5cbfa5 100644 --- a/csrc/cpu/comm/ccl.cpp +++ b/csrc/cpu/comm/ccl.cpp @@ -537,7 +537,7 @@ static void parallel_memcpy(void* to, void* from, size_t n_bytes) } } -void inference_all_reduce(torch::Tensor& data, py::object op, std::vector group, bool async_op) +void inference_all_reduce(torch::Tensor& data, py::object op, bool async_op) { static py::object ReduceOp = py::module_::import("deepspeed.comm").attr("ReduceOp"); static auto ReduceOpSum = (int)py::int_(ReduceOp.attr("SUM").attr("value")); @@ -562,7 +562,7 @@ void inference_all_reduce(torch::Tensor& data, py::object op, std::vector g data.numel(), get_ccl_datatype(data.scalar_type()), get_ccl_reduce_op(op, data), - _get_comm_from_group(group)) + _get_comm_from_group()) .wait()); return; } diff --git a/deepspeed/comm/ccl.py b/deepspeed/comm/ccl.py index 38a22f20109f..0fea6fe2ff2e 100644 --- a/deepspeed/comm/ccl.py +++ b/deepspeed/comm/ccl.py @@ -61,7 +61,8 @@ def is_initialized(self): def run_collective(self, name, **kwargs): if name in self.available_coll: - kwargs['group'] = self.get_all_ranks_from_group(kwargs['group']) + if 'group' in kwargs: + 
kwargs['group'] = self.get_all_ranks_from_group(kwargs['group']) if 'dst' in kwargs: kwargs['dst'] = kwargs['group'].index(kwargs['dst']) if 'src' in kwargs: @@ -71,23 +72,38 @@ def run_collective(self, name, **kwargs): return CCLHandler(self.ccl_comm_op) else: func = "super(CCLBackend, self)." + name - return eval(func)(*(kwargs.values())) + eval(func)(*(kwargs.values())) + return CCLHandler(self.ccl_comm_op) def all_reduce(self, tensor, op=ReduceOp.SUM, group=None, async_op=False): use_caching = False if use_caching: match_id = f"{tensor.size()}-{op}" - return self.run_collective(name="all_reduce_caching", - tensor=tensor, - op=op, - match_id=match_id, - group=group, - async_op=async_op) + name = "all_reduce_caching" + if name in self.available_coll: + group = self.get_all_ranks_from_group(group) + return self.ccl_comm_op.all_reduce_caching(tensor, op, match_id, group, async_op) + else: + return self.run_collective(name="all_reduce_caching", + tensor=tensor, + op=op, + match_id=match_id, + group=group, + async_op=async_op) else: - return self.run_collective(name="all_reduce", tensor=tensor, op=op, group=group, async_op=async_op) + name = "all_reduce" + if name in self.available_coll: + group = self.get_all_ranks_from_group(group) + return self.ccl_comm_op.all_reduce(tensor, op, group, async_op) + else: + return self.run_collective(name=name, tensor=tensor, op=op, group=group, async_op=async_op) def inference_all_reduce(self, tensor, op=ReduceOp.SUM, group=None, async_op=False): - return self.run_collective(name="inference_all_reduce", tensor=tensor, op=op, group=group, async_op=async_op) + name = "inference_all_reduce" + if name in self.available_coll: + return self.ccl_comm_op.inference_all_reduce(tensor, op, async_op) + else: + return self.run_collective(name=name, tensor=tensor, op=op, group=None, async_op=async_op) def broadcast(self, tensor, src, group=None, async_op=False): return self.run_collective(name="broadcast", tensor=tensor, src=src, group=group, async_op=async_op) @@ -120,11 +136,11 @@ def all_to_all_single(self, output, input, output_split_sizes, input_split_sizes input_split_sizes=input_split_sizes, group=group) - def send(self, tensor, dst, group=None, async_op=False): - return self.run_collective(name="send", tensor=tensor, dst=dst, group=group, async_op=async_op) + def send(self, tensor, dst, group=None, tag=0): + return self.run_collective(name="send", tensor=tensor, dst=dst, group=group, tag=tag) - def recv(self, tensor, src, group=None, async_op=False): - return self.run_collective(name="recv", tensor=tensor, src=src, group=group, async_op=async_op) + def recv(self, tensor, src, group=None, tag=0): + return self.run_collective(name="recv", tensor=tensor, src=src, group=group, tag=tag) def gather(self, tensor, gather_list, dst, group=None, async_op=False): return self.run_collective(name="gather", tensor=tensor, gather_list=gather_list, dst=dst, group=group) From 9fb8ecb6587c58bb08c2e80c72e7c3dcf02b6c58 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Tue, 21 Nov 2023 09:05:08 +0000 Subject: [PATCH 18/26] clean up all_reduce_caching path --- deepspeed/comm/ccl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/comm/ccl.py b/deepspeed/comm/ccl.py index 0fea6fe2ff2e..982e13c1ce1a 100644 --- a/deepspeed/comm/ccl.py +++ b/deepspeed/comm/ccl.py @@ -84,7 +84,7 @@ def all_reduce(self, tensor, op=ReduceOp.SUM, group=None, async_op=False): group = self.get_all_ranks_from_group(group) return self.ccl_comm_op.all_reduce_caching(tensor, op, 
match_id, group, async_op) else: - return self.run_collective(name="all_reduce_caching", + return self.run_collective(name=name, tensor=tensor, op=op, match_id=match_id, From c4cabcd5c2d281ccfb5c2524722ccb292049cfad Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Wed, 22 Nov 2023 02:33:42 +0000 Subject: [PATCH 19/26] fix formatting --- deepspeed/comm/ccl.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/deepspeed/comm/ccl.py b/deepspeed/comm/ccl.py index 982e13c1ce1a..a199843d5086 100644 --- a/deepspeed/comm/ccl.py +++ b/deepspeed/comm/ccl.py @@ -85,11 +85,11 @@ def all_reduce(self, tensor, op=ReduceOp.SUM, group=None, async_op=False): return self.ccl_comm_op.all_reduce_caching(tensor, op, match_id, group, async_op) else: return self.run_collective(name=name, - tensor=tensor, - op=op, - match_id=match_id, - group=group, - async_op=async_op) + tensor=tensor, + op=op, + match_id=match_id, + group=group, + async_op=async_op) else: name = "all_reduce" if name in self.available_coll: From 3663b75bc9332c5213b06048932af39acc9d4520 Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Thu, 23 Nov 2023 17:54:58 +0800 Subject: [PATCH 20/26] preload libstdc++ from system lib path instead of conda path --- .github/workflows/cpu-inference.yml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index 6ad9c7098cad..09c15f096734 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -38,11 +38,6 @@ jobs: gcc --version g++ --version - - name: Mitigation for GLIBCXX_3.4.30 - run: | - # install glibc higher version - sudo conda install -y -c conda-forge libstdcxx-ng=12 - - name: Detect instruction sets on instance run: | lscpu @@ -100,6 +95,7 @@ jobs: run: | pip list source oneCCL/build/_install/env/setvars.sh + export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6 # check whether the environment is properly setup python -c "import torch;import intel_extension_for_pytorch as ipex;import oneccl_bindings_for_pytorch;print('done')" python -c "import deepspeed;from deepspeed.accelerator import get_accelerator;print(get_accelerator().device_name());print(get_accelerator().is_available())" From 3934919c44cb3548afa31811d484e89fa70e1ab7 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Wed, 29 Nov 2023 12:34:19 +0800 Subject: [PATCH 21/26] prep oneCCL before running unit tests --- .github/workflows/cpu-inference.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index 09c15f096734..8f410378c5af 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -102,6 +102,8 @@ jobs: - name: Unit tests run: | + # prep oneCCL for CCLBackend comm ops building + source oneCCL/build/_install/env/setvars.sh unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests # LOCAL_SIZE=2 enforce CPU to report 2 devices, this helps run the test on github default runner From b90fa99e18285f9d9f5e7ecda19d74fd4c9c3f40 Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Sat, 2 Dec 2023 09:13:37 +0800 Subject: [PATCH 22/26] prep libstdc++ in UT run --- .github/workflows/cpu-inference.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index 8f410378c5af..7a166e34c80f 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -104,6 +104,7 @@ jobs: run: | # 
prep oneCCL for CCLBackend comm ops building source oneCCL/build/_install/env/setvars.sh + export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6 unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests # LOCAL_SIZE=2 enforce CPU to report 2 devices, this helps run the test on github default runner From 80550342e2af8167f96b89f4e487deb99315fb62 Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Tue, 5 Dec 2023 09:47:23 +0800 Subject: [PATCH 23/26] allow codegen test for bf16 --- tests/unit/inference/test_inference.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index 527b4d5a8b67..dcd844d2a566 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -549,10 +549,6 @@ def test( if dtype not in get_accelerator().supported_dtypes(): pytest.skip(f"Acceleraor {get_accelerator().device_name()} does not support {dtype}.") - # TODO: enable this test after torch 2.1 stable release - if dtype == torch.bfloat16 and model_w_task[0] == "Salesforce/codegen-350M-mono": - pytest.skip("Codegen model(bf16) need to use torch version > 2.0.") - model, task = model_w_task local_rank = int(os.getenv("LOCAL_RANK", "0")) world_size = int(os.getenv("WORLD_SIZE", "2")) From b50a481172639f969a8ae1373bb14d6761f03564 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Thu, 7 Dec 2023 00:42:43 -0800 Subject: [PATCH 24/26] disable codegen bf16 --- tests/unit/inference/test_inference.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index dcd844d2a566..767e1dba23ea 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -549,6 +549,9 @@ def test( if dtype not in get_accelerator().supported_dtypes(): pytest.skip(f"Acceleraor {get_accelerator().device_name()} does not support {dtype}.") + if dtype == torch.bfloat16 and model_w_task[0] == "Salesforce/codegen-350M-mono": + pytest.skip("Disable Codegen model(bf16) due to slight result difference") + model, task = model_w_task local_rank = int(os.getenv("LOCAL_RANK", "0")) world_size = int(os.getenv("WORLD_SIZE", "2")) From a72beea53a8317d551fce74e03548c8fd76c8f0a Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Wed, 3 Jan 2024 03:14:29 +0000 Subject: [PATCH 25/26] fix test_inference_config UT error --- tests/unit/inference/test_inference_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/inference/test_inference_config.py b/tests/unit/inference/test_inference_config.py index 375563abf65b..39d62d17372c 100644 --- a/tests/unit/inference/test_inference_config.py +++ b/tests/unit/inference/test_inference_config.py @@ -15,7 +15,7 @@ class TestInferenceConfig(DistributedTest): world_size = 1 def test_overlap_kwargs(self): - config = {"replace_with_kernel_inject": True} + config = {"replace_with_kernel_inject": True, "dtype": torch.float32} kwargs = {"replace_with_kernel_inject": True} engine = deepspeed.init_inference(torch.nn.Module(), config=config, **kwargs) @@ -37,7 +37,7 @@ def test_kwargs_and_config(self): assert engine._config.dtype == kwargs["dtype"] def test_json_config(self, tmpdir): - config = {"replace_with_kernel_inject": True} + config = {"replace_with_kernel_inject": True, "dtype": "torch.float32"} config_json = create_config_from_dict(tmpdir, config) engine = deepspeed.init_inference(torch.nn.Module(), config=config_json) From 
3244e1f7c5278e05dca8acbbde2a99a7d0c101cf Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Fri, 5 Jan 2024 09:29:54 +0800 Subject: [PATCH 26/26] fix typo --- .github/workflows/cpu-inference.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index 7a166e34c80f..a2ca41f4aa3a 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -56,7 +56,7 @@ jobs: run: | pip install torch python -m pip install intel_extension_for_pytorch - # the curl line is for troubleshootingn + # the curl line is for troubleshooting curl -L https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ python -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ pip install py-cpuinfo
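
Taken together, the test-robustness patches above (02, 03, 23 and 24) converge on a single skip-guard pattern: an inference test bails out early whenever the active accelerator cannot run it, and the CI workflow widens COLUMNS (patches 06 and 07) only so that the resulting skip reasons stay visible in the pytest output instead of being truncated. A minimal stand-alone sketch of that pattern follows. It is an illustration rather than code introduced by the patches: the helper name skip_if_unsupported and the import path deepspeed.ops.op_builder for InferenceBuilder are assumptions made for the example.

# Illustrative sketch only: it mirrors the guards added to
# tests/unit/inference/test_inference.py, but the helper itself is
# hypothetical and not part of this patch series.
import pytest
import torch

import deepspeed
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder import InferenceBuilder  # assumed import path


def skip_if_unsupported(dtype, model_name=None):
    """Skip the current inference test when the active accelerator cannot run it."""
    accel = get_accelerator()
    # Patch 02: the accelerator (e.g. CPU) may not support the requested dtype.
    if dtype not in accel.supported_dtypes():
        pytest.skip(f"Accelerator {accel.device_name()} does not support {dtype}.")
    # Patch 03: kernel-injection tests require the InferenceBuilder op to be buildable here.
    if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
        pytest.skip("InferenceBuilder op is not implemented on this system.")
    # Patch 24: one model/dtype combination with known numerical drift is skipped outright.
    if dtype == torch.bfloat16 and model_name == "Salesforce/codegen-350M-mono":
        pytest.skip("Codegen model (bf16) disabled due to slight result difference.")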