From e60e645e5f959ae90da24d25c308ba8df360b1af Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Thu, 28 Sep 2023 14:27:06 +0800 Subject: [PATCH 01/26] add a white change that breaks formatting --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index d9aed9b47bd8..30d4c166c8c5 100755 --- a/setup.py +++ b/setup.py @@ -211,6 +211,7 @@ def op_enabled(op_name): def create_dir_symlink(src, dest): + if not os.path.islink(dest): if os.path.exists(dest): os.remove(dest) From ed95d21e96308b11332d1c4720d225a53c28d187 Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Sat, 30 Sep 2023 22:38:11 +0800 Subject: [PATCH 02/26] fix TestModelTask --- tests/unit/inference/test_inference.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index 894f040be207..602c015dc1ee 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -280,6 +280,9 @@ def test( if invalid_test_msg: pytest.skip(invalid_test_msg) + if dtype not in get_accelerator().supported_dtypes(): + pytest.skip(f"Acceleraor {get_accelerator().device_name()} does not support {dtype}.") + model, task = model_w_task local_rank = int(os.getenv("LOCAL_RANK", "0")) From f0022b076be2e06aea3e1c8d4e7b81f033a04acc Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Sun, 1 Oct 2023 11:39:08 +0800 Subject: [PATCH 03/26] Skip TestModelTask if InferenceBuilder are not implemented --- tests/unit/inference/test_inference.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index 602c015dc1ee..bf8182796c86 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -283,6 +283,9 @@ def test( if dtype not in get_accelerator().supported_dtypes(): pytest.skip(f"Acceleraor {get_accelerator().device_name()} does not support {dtype}.") + if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: + pytest.skip("This op had not been implemented on this system.", allow_module_level=True) + model, task = model_w_task local_rank = int(os.getenv("LOCAL_RANK", "0")) From af2f380764cf677a7e058132938365c0d4dfb29f Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Sun, 1 Oct 2023 14:24:05 +0800 Subject: [PATCH 04/26] remove blank change --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 30d4c166c8c5..d9aed9b47bd8 100755 --- a/setup.py +++ b/setup.py @@ -211,7 +211,6 @@ def op_enabled(op_name): def create_dir_symlink(src, dest): - if not os.path.islink(dest): if os.path.exists(dest): os.remove(dest) From ac4254fbfea5d93c78992a81ef0504d5d84ee381 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Sat, 7 Oct 2023 15:27:01 +0800 Subject: [PATCH 05/26] Reuse hf_model list among tests to avoid slow loading (#16) * Reuse hf_model list among tests to avoid slow loading * try to debug test skip * another attempt to print test failure * another attempt * more attempt to print skip reason * revert changes that are temporary --- tests/unit/inference/test_inference.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index bf8182796c86..b86447c9fbb7 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -5,6 +5,7 @@ import os import time +import pickle import torch import pytest import itertools @@ -65,7 +66,13 @@ ] # Get a list of all models and 
mapping from task to supported models -_hf_models = list(HfApi().list_models()) +try: + with open("hf_models.pkl", "rb") as fp: + _hf_models = pickle.load(fp) +except FileNotFoundError: + _hf_models = list(HfApi().list_models()) + with open("hf_models.pkl", "wb") as fp: + pickle.dump(_hf_models, fp) _hf_model_names = [m.modelId for m in _hf_models] _hf_task_to_models = {task: [m.modelId for m in _hf_models if m.pipeline_tag == task] for task in _test_tasks} From cc0294f824db968a21cafbbec82dbdfea6137129 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Sun, 8 Oct 2023 19:54:10 +0800 Subject: [PATCH 06/26] Change COLUMNS to 140 to allow display of pytest skip message; Sanity check before run unit tests * Reuse hf_model list among tests to avoid slow loading * try to debug test skip * another attempt to print test failure * another attempt * more attempt to print skip reason * revert changes that are temporary * remove extra flag for pytest * add a dummy test to test pytest * test skip message * put old test and temp test together to compare * try to find out the reason skip message are not printed * comment all skips * check skip in common.py * revert last commits * shorten name to show skip message * change test name * expand number of columns to 120 when running pytest * detect deepspeed installation * add test code for environment * change pytorch version 2.1.0==>2.0.1 * add py-cpuinfo as requiiremetns to dev * install py-cpuinfo manually * Change COLUMNS to 140 to allow display of pytest skip message --- .github/workflows/cpu-inference.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index 8eeca324c350..c35b096b56db 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -74,7 +74,10 @@ jobs: - name: Unit tests run: | source oneCCL/build/_install/env/setvars.sh + # check whether the environment is properly setup + python -c "import torch;import intel_extension_for_pytorch as ipex;import oneccl_bindings_for_pytorch;print('done')" + python -c "import deepspeed;from deepspeed.accelerator import get_accelerator;print(get_accelerator().device_name());print(get_accelerator().is_available())" unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/ - TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' -m 'inference' unit/ + COLUMNS=140 TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/ + COLUMNS=140 TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' -m 'inference' unit/ From 861088fa059bd191f3180097877097ab9c90b9f5 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Wed, 25 Oct 2023 10:24:49 +0800 Subject: [PATCH 07/26] Gma/fix cpu inference local (#19) * Reuse hf_model list among tests to avoid slow loading * try to debug test skip * another attempt to print test failure * another attempt * more attempt to print skip reason * revert changes that are temporary * remove extra flag for pytest * add a dummy test to test pytest * test skip message * put old test and temp test together to compare * try to find out the reason skip message are not printed * comment all skips * check skip in common.py * revert last commits * shorten name to show skip 
message * change test name * expand number of columns to 120 when running pytest * detect deepspeed installation * add test code for environment * change pytorch version 2.1.0==>2.0.1 * add py-cpuinfo as requiiremetns to dev * install py-cpuinfo manually * Change COLUMNS to 140 to allow display of pytest skip message * ping pytorch to 2.0.1 * add pip list before install deepspeed * install cpuinfo before install deepspeed * change workflow to work with pytorch 2.1 * add torch install to CI workflow * install py-cpuinfo * enforce autotp test on single socket instance * enforce 2 ranks in cpu autotp tests * enable tests that can only run on torch 2.1 or above * make build faster * remove -j make option * add back skip for codegen * check UT result * update tutorial --- .github/workflows/cpu-inference.yml | 15 +++++++++++---- .../accelerator-abstraction-interface.md | 2 +- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index 6fdc5f386445..b37b26fd6d10 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -39,8 +39,14 @@ jobs: - name: Install oneCCL Bindings for PyTorch run: | + pip install torch python -m pip install intel_extension_for_pytorch - python -m pip install oneccl_bind_pt==2.0 -f https://developer.intel.com/ipex-whl-stable-cpu + python -m pip install oneccl_bind_pt -f https://developer.intel.com/ipex-whl-stable-cpu + pip install py-cpuinfo + # check installed version + pip list |grep \\\ + pip list |grep intel-extension-for-pytorch + pip list |grep oneccl-bind-pt - name: Install oneCCL run: | @@ -79,6 +85,7 @@ jobs: python -c "import torch;import intel_extension_for_pytorch as ipex;import oneccl_bindings_for_pytorch;print('done')" python -c "import deepspeed;from deepspeed.accelerator import get_accelerator;print(get_accelerator().device_name());print(get_accelerator().is_available())" unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch - cd tests - COLUMNS=140 TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/ - COLUMNS=140 TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' -m 'inference' unit/ + cd tests + # LOCAL_SIZE=2 enforce CPU to report 2 devices, this helps run the test on github default runner + LOCAL_SIZE=2 COLUMNS=240 TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/ + LOCAL_SIZE=2 COLUMNS=240 TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' -m 'inference' unit/ diff --git a/docs/_tutorials/accelerator-abstraction-interface.md b/docs/_tutorials/accelerator-abstraction-interface.md index 0810c3c6b5d7..db1a6005f793 100644 --- a/docs/_tutorials/accelerator-abstraction-interface.md +++ b/docs/_tutorials/accelerator-abstraction-interface.md @@ -96,7 +96,7 @@ To run DeepSpeed model on CPU, use the following steps to prepare environment: ``` python -m pip install intel_extension_for_pytorch -python -m pip install oneccl_bind_pt==2.0 -f https://developer.intel.com/ipex-whl-stable-cpu +python -m pip install oneccl_bind_pt -f https://developer.intel.com/ipex-whl-stable-cpu git clone https://github.com/oneapi-src/oneCCL cd oneCCL mkdir build From 48787d9970fac6089d74449de67a5835fd5cc1ad Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Wed, 25 Oct 2023 02:43:47 +0000 Subject: [PATCH 08/26] 
change cpu inference test to self hosted v100 runner --- .github/workflows/cpu-inference.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index b37b26fd6d10..2c43952d4c1f 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -15,7 +15,7 @@ concurrency: jobs: unit-tests: - runs-on: ubuntu-20.04 + runs-on: [self-hosted, nvidia, cu116, v100] steps: - uses: actions/checkout@v3 From 17183bd246889bbf64f222f19227d500a54efa71 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Thu, 26 Oct 2023 06:05:17 +0000 Subject: [PATCH 09/26] Running on self-hosted cpu rather than cuda machine. --- .github/workflows/cpu-inference.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index 2c43952d4c1f..27d59d4049cb 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -15,7 +15,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu116, v100] + runs-on: [self-hosted, cpu] steps: - uses: actions/checkout@v3 From f40a484ee6bbad517a6d5a55e049cd6272c3b14a Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Mon, 30 Oct 2023 02:46:41 +0000 Subject: [PATCH 10/26] remove ad-hoc running of cpu-inference --- .github/workflows/cpu-inference.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index 27d59d4049cb..4b06324f96b8 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -5,7 +5,6 @@ on: paths-ignore: - 'docs/**' - 'blogs/**' - workflow_dispatch: merge_group: branches: [ master ] From 4ed3b60343f0344bf4e40fd020e90e0bd387e365 Mon Sep 17 00:00:00 2001 From: Liangliang-Ma <1906710196@qq.com> Date: Mon, 30 Oct 2023 12:06:40 +0800 Subject: [PATCH 11/26] update ccl.py for error type (#24) * Remove PP Grad Tail Check (#2538) * Only communicate grad tail if it exists Co-authored-by: Dashiell Stander * Revert previous patch and just always send the grad tail * Formatting --------- Co-authored-by: Dashiell Stander Co-authored-by: Olatunji Ruwase Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> * Added __HIP_PLATFORM_AMD__=1 (#4570) * fix multiple definition while building evoformer (#4556) Current builder for evoformer use the same name for `attention.cpp` and `attention.cu`, leading to same intermediate filename `attention.o`: ```shell march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe - isystem /home/zejianxie/.conda/envs/dll/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /home/zejianxie/.conda/envs/dll/include build/temp.linux-x86_64-cpython- 310/csrc/deepspeed4science/evoformer_attn/attention.o build/temp.linux-x86_64-cpython- 310/csrc/deepspeed4science/evoformer_attn/attention.o build/temp.linux-x86_64-cpython- 310/csrc/deepspeed4science/evoformer_attn/attention_back.o ``` and ``` `attention_impl(at::Tensor&, at::Tensor&, at::Tensor&, at::Tensor&, at::Tensor&, at::Tensor&, at::Tensor&)': tmpxft_0012bef1_00000000-6_attention.compute_86.cudafe1.cpp:(.text+0x330): multiple definition of `attention_impl(at::Tensor&, at::Tensor&, at::Tensor&, at::Tensor&, at::Tensor&, at::Tensor&, at::Tensor&)'; 
build/temp.linux-x86_64-cpython-310/csrc/deepspeed4science/evoformer_attn/attention.o:tmpxft_0012bef1_00000000-6_attention.compute_86.cudafe1.cpp:(.text+0x330): first defined here /home/zejianxie/.conda/envs/dll/bin/../lib/gcc/x86_64-conda-linux-gnu/11.4.0/../../../../x86_64-conda-linux-gnu/bin/ld: build/temp.linux-x86_64-cpython-310/csrc/deepspeed4science/evoformer_attn/attention.o:(.bss+0x0): multiple definition of `torch::autograd::(anonymous namespace)::graph_task_id'; build/temp.linux-x86_64-cpython-310/csrc/deepspeed4science/evoformer_attn/attention.o:(.bss+0x0): first defined here ``` I use following to reproduce and confirm my fix works: ``` git clone https://github.com/NVIDIA/cutlass --depth 1 CUTLASS_PATH=$PWD/cutlass DS_BUILD_EVOFORMER_ATTN=1 pip install ./DeepSpeed --global-option="build_ext" ``` ![image](https://github.com/microsoft/DeepSpeed/assets/41792945/9e406b37-330c-431c-8bf9-6be378dee4ff) Co-authored-by: Conglong Li * Update ccl.py --------- Co-authored-by: Quentin Anthony Co-authored-by: Dashiell Stander Co-authored-by: Olatunji Ruwase Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Ramya Ramineni <62723901+rraminen@users.noreply.github.com> Co-authored-by: Xie Zejian Co-authored-by: Conglong Li --- .../evoformer_attn/{attention.cu => attention_cu.cu} | 0 deepspeed/comm/ccl.py | 2 +- deepspeed/runtime/pipe/engine.py | 2 +- op_builder/builder.py | 3 +++ op_builder/evoformer_attn.py | 2 +- 5 files changed, 6 insertions(+), 3 deletions(-) rename csrc/deepspeed4science/evoformer_attn/{attention.cu => attention_cu.cu} (100%) diff --git a/csrc/deepspeed4science/evoformer_attn/attention.cu b/csrc/deepspeed4science/evoformer_attn/attention_cu.cu similarity index 100% rename from csrc/deepspeed4science/evoformer_attn/attention.cu rename to csrc/deepspeed4science/evoformer_attn/attention_cu.cu diff --git a/deepspeed/comm/ccl.py b/deepspeed/comm/ccl.py index 6e915d0ca430..38a22f20109f 100644 --- a/deepspeed/comm/ccl.py +++ b/deepspeed/comm/ccl.py @@ -170,7 +170,7 @@ def get_all_ranks_from_group(self, group): while True: results.append(super(CCLBackend, self).get_global_rank(group, rank)) rank += 1 - except ValueError: + except (ValueError, RuntimeError): pass if tuple(results) not in self.groups: self._new_group(results, group) diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index 2542dd602b1f..b89c1aca7f2a 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -988,7 +988,7 @@ def _exec_send_grads(self, buffer_id): if isinstance(inputs, tuple): first_input = inputs[0] assert all([torch.is_tensor(elt) for elt in inputs[1:]]) - inputs_grad_tail = [elt.grad for elt in inputs[1:] if elt.grad is not None] + inputs_grad_tail = [elt.grad for elt in inputs[1:]] elif torch.is_tensor(inputs): first_input = inputs inputs_grad_tail = [] diff --git a/op_builder/builder.py b/op_builder/builder.py index 1fdcd485f5fb..62683774f20a 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -486,6 +486,9 @@ def jit_load(self, verbose=True): cxx_args.append("-DBF16_AVAILABLE") nvcc_args.append("-DBF16_AVAILABLE") + if self.is_rocm_pytorch(): + cxx_args.append("-D__HIP_PLATFORM_AMD__=1") + op_module = load(name=self.name, sources=self.strip_empty_entries(sources), extra_include_paths=self.strip_empty_entries(extra_include_paths), diff --git a/op_builder/evoformer_attn.py b/op_builder/evoformer_attn.py index f4311848d0d4..6e7721f94e01 100644 --- a/op_builder/evoformer_attn.py +++ 
b/op_builder/evoformer_attn.py @@ -27,7 +27,7 @@ def extra_ldflags(self): def sources(self): src_dir = 'csrc/deepspeed4science/evoformer_attn' - return [f'{src_dir}/attention.cpp', f'{src_dir}/attention_back.cu', f'{src_dir}/attention.cu'] + return [f'{src_dir}/attention.cpp', f'{src_dir}/attention_back.cu', f'{src_dir}/attention_cu.cu'] def nvcc_args(self): args = super().nvcc_args() From 15295ae5a5aa9f2e20c554b7b42fc2ecaa729e49 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Fri, 3 Nov 2023 07:17:49 +0000 Subject: [PATCH 12/26] install gcc-9 in cpu workflow --- .github/workflows/cpu-inference.yml | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index 61bd68c737b0..adf309bef39e 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -24,6 +24,17 @@ jobs: - id: setup-venv uses: ./.github/workflows/setup-venv + - name: Install gcc-9 + run: | + sudo add-apt-repository -u ppa:ubuntu-toolchain-r/test + sudo apt install -y gcc-9 + + - name: Check gcc version + run: | + # Get gcc version + gcc --version + g++ --version + - name: Detect instruction sets on instance run: | lscpu @@ -75,16 +86,16 @@ jobs: pip install .[dev,1bit,autotuning,inf] ds_report - - name: Python environment + - name: Python environment check run: | pip list - - - name: Unit tests - run: | source oneCCL/build/_install/env/setvars.sh # check whether the environment is properly setup python -c "import torch;import intel_extension_for_pytorch as ipex;import oneccl_bindings_for_pytorch;print('done')" python -c "import deepspeed;from deepspeed.accelerator import get_accelerator;print(get_accelerator().device_name());print(get_accelerator().is_available())" + + - name: Unit tests + run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests # LOCAL_SIZE=2 enforce CPU to report 2 devices, this helps run the test on github default runner From d52ff77c6bcb50d3675bc21ed868012c64cb17b6 Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Sun, 5 Nov 2023 00:10:58 +0800 Subject: [PATCH 13/26] set gcc/g++ default to 9 in cpu inference workflow --- .github/workflows/cpu-inference.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index adf309bef39e..89df1d256179 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -27,7 +27,10 @@ jobs: - name: Install gcc-9 run: | sudo add-apt-repository -u ppa:ubuntu-toolchain-r/test - sudo apt install -y gcc-9 + sudo apt install -y gcc-9 g++-9 + # set gcc-9 and g++9 to default + sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 99 + sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 99 - name: Check gcc version run: | From e9fafa7cae0ff5dd9fb535a4bcdf12833b815064 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Tue, 7 Nov 2023 03:18:23 +0000 Subject: [PATCH 14/26] update oneccl_bind_pt installation steps --- .github/workflows/cpu-inference.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index 89df1d256179..ce5c86523506 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -56,7 +56,9 @@ jobs: run: | pip install torch python -m pip install intel_extension_for_pytorch - python -m pip install oneccl_bind_pt -f 
https://developer.intel.com/ipex-whl-stable-cpu + # the curl line is for troubleshootingn + curl -L https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ + python -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ pip install py-cpuinfo # check installed version pip list |grep \\\ From 51922e4872d06fca2a77376791a92a7dd11d02de Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Thu, 16 Nov 2023 03:21:10 +0000 Subject: [PATCH 15/26] mitigation for oneCCL GLIBCXX_3.4.30 not found issue --- .github/workflows/cpu-inference.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index ce5c86523506..0ee715b5aa1a 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -38,6 +38,11 @@ jobs: gcc --version g++ --version + - name: Mitigation for GLIBCXX_3.4.30 + run: | + # install glibc higher version + conda install -c conda-forge libstdcxx-ng=12 + - name: Detect instruction sets on instance run: | lscpu From fc6025c434c9271ef0c4108a9cbe32586caf83ab Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Sat, 18 Nov 2023 22:54:52 +0800 Subject: [PATCH 16/26] use sudo to install conda package --- .github/workflows/cpu-inference.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index 0ee715b5aa1a..6ad9c7098cad 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -41,7 +41,7 @@ jobs: - name: Mitigation for GLIBCXX_3.4.30 run: | # install glibc higher version - conda install -c conda-forge libstdcxx-ng=12 + sudo conda install -y -c conda-forge libstdcxx-ng=12 - name: Detect instruction sets on instance run: | From a8cec8b83d9b6a22b8e30f13bc4849ae5a0a41fd Mon Sep 17 00:00:00 2001 From: Liangliang-Ma <1906710196@qq.com> Date: Tue, 21 Nov 2023 17:02:01 +0800 Subject: [PATCH 17/26] ccl issues fix (#32) * fix api bugs * remove group in inference_all_reduce * remove group from inference_all_reduce in ccl.cpp * direct call all_reduce and inference_all_reduce * Update ccl.py --- csrc/cpu/comm/ccl.cpp | 4 ++-- deepspeed/comm/ccl.py | 44 +++++++++++++++++++++++++++++-------------- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/csrc/cpu/comm/ccl.cpp b/csrc/cpu/comm/ccl.cpp index 9a04b6f873f2..6428ab5cbfa5 100644 --- a/csrc/cpu/comm/ccl.cpp +++ b/csrc/cpu/comm/ccl.cpp @@ -537,7 +537,7 @@ static void parallel_memcpy(void* to, void* from, size_t n_bytes) } } -void inference_all_reduce(torch::Tensor& data, py::object op, std::vector group, bool async_op) +void inference_all_reduce(torch::Tensor& data, py::object op, bool async_op) { static py::object ReduceOp = py::module_::import("deepspeed.comm").attr("ReduceOp"); static auto ReduceOpSum = (int)py::int_(ReduceOp.attr("SUM").attr("value")); @@ -562,7 +562,7 @@ void inference_all_reduce(torch::Tensor& data, py::object op, std::vector g data.numel(), get_ccl_datatype(data.scalar_type()), get_ccl_reduce_op(op, data), - _get_comm_from_group(group)) + _get_comm_from_group()) .wait()); return; } diff --git a/deepspeed/comm/ccl.py b/deepspeed/comm/ccl.py index 38a22f20109f..0fea6fe2ff2e 100644 --- a/deepspeed/comm/ccl.py +++ b/deepspeed/comm/ccl.py @@ -61,7 +61,8 @@ def is_initialized(self): def run_collective(self, name, **kwargs): if name in self.available_coll: - kwargs['group'] = self.get_all_ranks_from_group(kwargs['group']) + if 'group' in kwargs: + 
kwargs['group'] = self.get_all_ranks_from_group(kwargs['group']) if 'dst' in kwargs: kwargs['dst'] = kwargs['group'].index(kwargs['dst']) if 'src' in kwargs: @@ -71,23 +72,38 @@ def run_collective(self, name, **kwargs): return CCLHandler(self.ccl_comm_op) else: func = "super(CCLBackend, self)." + name - return eval(func)(*(kwargs.values())) + eval(func)(*(kwargs.values())) + return CCLHandler(self.ccl_comm_op) def all_reduce(self, tensor, op=ReduceOp.SUM, group=None, async_op=False): use_caching = False if use_caching: match_id = f"{tensor.size()}-{op}" - return self.run_collective(name="all_reduce_caching", - tensor=tensor, - op=op, - match_id=match_id, - group=group, - async_op=async_op) + name = "all_reduce_caching" + if name in self.available_coll: + group = self.get_all_ranks_from_group(group) + return self.ccl_comm_op.all_reduce_caching(tensor, op, match_id, group, async_op) + else: + return self.run_collective(name="all_reduce_caching", + tensor=tensor, + op=op, + match_id=match_id, + group=group, + async_op=async_op) else: - return self.run_collective(name="all_reduce", tensor=tensor, op=op, group=group, async_op=async_op) + name = "all_reduce" + if name in self.available_coll: + group = self.get_all_ranks_from_group(group) + return self.ccl_comm_op.all_reduce(tensor, op, group, async_op) + else: + return self.run_collective(name=name, tensor=tensor, op=op, group=group, async_op=async_op) def inference_all_reduce(self, tensor, op=ReduceOp.SUM, group=None, async_op=False): - return self.run_collective(name="inference_all_reduce", tensor=tensor, op=op, group=group, async_op=async_op) + name = "inference_all_reduce" + if name in self.available_coll: + return self.ccl_comm_op.inference_all_reduce(tensor, op, async_op) + else: + return self.run_collective(name=name, tensor=tensor, op=op, group=None, async_op=async_op) def broadcast(self, tensor, src, group=None, async_op=False): return self.run_collective(name="broadcast", tensor=tensor, src=src, group=group, async_op=async_op) @@ -120,11 +136,11 @@ def all_to_all_single(self, output, input, output_split_sizes, input_split_sizes input_split_sizes=input_split_sizes, group=group) - def send(self, tensor, dst, group=None, async_op=False): - return self.run_collective(name="send", tensor=tensor, dst=dst, group=group, async_op=async_op) + def send(self, tensor, dst, group=None, tag=0): + return self.run_collective(name="send", tensor=tensor, dst=dst, group=group, tag=tag) - def recv(self, tensor, src, group=None, async_op=False): - return self.run_collective(name="recv", tensor=tensor, src=src, group=group, async_op=async_op) + def recv(self, tensor, src, group=None, tag=0): + return self.run_collective(name="recv", tensor=tensor, src=src, group=group, tag=tag) def gather(self, tensor, gather_list, dst, group=None, async_op=False): return self.run_collective(name="gather", tensor=tensor, gather_list=gather_list, dst=dst, group=group) From 9fb8ecb6587c58bb08c2e80c72e7c3dcf02b6c58 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Tue, 21 Nov 2023 09:05:08 +0000 Subject: [PATCH 18/26] clean up all_reduce_caching path --- deepspeed/comm/ccl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/comm/ccl.py b/deepspeed/comm/ccl.py index 0fea6fe2ff2e..982e13c1ce1a 100644 --- a/deepspeed/comm/ccl.py +++ b/deepspeed/comm/ccl.py @@ -84,7 +84,7 @@ def all_reduce(self, tensor, op=ReduceOp.SUM, group=None, async_op=False): group = self.get_all_ranks_from_group(group) return self.ccl_comm_op.all_reduce_caching(tensor, op, 
match_id, group, async_op) else: - return self.run_collective(name="all_reduce_caching", + return self.run_collective(name=name, tensor=tensor, op=op, match_id=match_id, From c4cabcd5c2d281ccfb5c2524722ccb292049cfad Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Wed, 22 Nov 2023 02:33:42 +0000 Subject: [PATCH 19/26] fix formatting --- deepspeed/comm/ccl.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/deepspeed/comm/ccl.py b/deepspeed/comm/ccl.py index 982e13c1ce1a..a199843d5086 100644 --- a/deepspeed/comm/ccl.py +++ b/deepspeed/comm/ccl.py @@ -85,11 +85,11 @@ def all_reduce(self, tensor, op=ReduceOp.SUM, group=None, async_op=False): return self.ccl_comm_op.all_reduce_caching(tensor, op, match_id, group, async_op) else: return self.run_collective(name=name, - tensor=tensor, - op=op, - match_id=match_id, - group=group, - async_op=async_op) + tensor=tensor, + op=op, + match_id=match_id, + group=group, + async_op=async_op) else: name = "all_reduce" if name in self.available_coll: From 3663b75bc9332c5213b06048932af39acc9d4520 Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Thu, 23 Nov 2023 17:54:58 +0800 Subject: [PATCH 20/26] preload libstdc++ from system lib path instead of conda path --- .github/workflows/cpu-inference.yml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index 6ad9c7098cad..09c15f096734 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -38,11 +38,6 @@ jobs: gcc --version g++ --version - - name: Mitigation for GLIBCXX_3.4.30 - run: | - # install glibc higher version - sudo conda install -y -c conda-forge libstdcxx-ng=12 - - name: Detect instruction sets on instance run: | lscpu @@ -100,6 +95,7 @@ jobs: run: | pip list source oneCCL/build/_install/env/setvars.sh + export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6 # check whether the environment is properly setup python -c "import torch;import intel_extension_for_pytorch as ipex;import oneccl_bindings_for_pytorch;print('done')" python -c "import deepspeed;from deepspeed.accelerator import get_accelerator;print(get_accelerator().device_name());print(get_accelerator().is_available())" From 3934919c44cb3548afa31811d484e89fa70e1ab7 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Wed, 29 Nov 2023 12:34:19 +0800 Subject: [PATCH 21/26] prep oneCCL before running unit tests --- .github/workflows/cpu-inference.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index 09c15f096734..8f410378c5af 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -102,6 +102,8 @@ jobs: - name: Unit tests run: | + # prep oneCCL for CCLBackend comm ops building + source oneCCL/build/_install/env/setvars.sh unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests # LOCAL_SIZE=2 enforce CPU to report 2 devices, this helps run the test on github default runner From b90fa99e18285f9d9f5e7ecda19d74fd4c9c3f40 Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Sat, 2 Dec 2023 09:13:37 +0800 Subject: [PATCH 22/26] prep libstdc++ in UT run --- .github/workflows/cpu-inference.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index 8f410378c5af..7a166e34c80f 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -104,6 +104,7 @@ jobs: run: | # 
prep oneCCL for CCLBackend comm ops building source oneCCL/build/_install/env/setvars.sh + export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6 unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests # LOCAL_SIZE=2 enforce CPU to report 2 devices, this helps run the test on github default runner From 80550342e2af8167f96b89f4e487deb99315fb62 Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Tue, 5 Dec 2023 09:47:23 +0800 Subject: [PATCH 23/26] allow codegen test for bf16 --- tests/unit/inference/test_inference.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index 527b4d5a8b67..dcd844d2a566 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -549,10 +549,6 @@ def test( if dtype not in get_accelerator().supported_dtypes(): pytest.skip(f"Acceleraor {get_accelerator().device_name()} does not support {dtype}.") - # TODO: enable this test after torch 2.1 stable release - if dtype == torch.bfloat16 and model_w_task[0] == "Salesforce/codegen-350M-mono": - pytest.skip("Codegen model(bf16) need to use torch version > 2.0.") - model, task = model_w_task local_rank = int(os.getenv("LOCAL_RANK", "0")) world_size = int(os.getenv("WORLD_SIZE", "2")) From b50a481172639f969a8ae1373bb14d6761f03564 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Thu, 7 Dec 2023 00:42:43 -0800 Subject: [PATCH 24/26] disable codegen bf16 --- tests/unit/inference/test_inference.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index dcd844d2a566..767e1dba23ea 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -549,6 +549,9 @@ def test( if dtype not in get_accelerator().supported_dtypes(): pytest.skip(f"Acceleraor {get_accelerator().device_name()} does not support {dtype}.") + if dtype == torch.bfloat16 and model_w_task[0] == "Salesforce/codegen-350M-mono": + pytest.skip("Disable Codegen model(bf16) due to slight result difference") + model, task = model_w_task local_rank = int(os.getenv("LOCAL_RANK", "0")) world_size = int(os.getenv("WORLD_SIZE", "2")) From a72beea53a8317d551fce74e03548c8fd76c8f0a Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Wed, 3 Jan 2024 03:14:29 +0000 Subject: [PATCH 25/26] fix test_inference_config UT error --- tests/unit/inference/test_inference_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/inference/test_inference_config.py b/tests/unit/inference/test_inference_config.py index 375563abf65b..39d62d17372c 100644 --- a/tests/unit/inference/test_inference_config.py +++ b/tests/unit/inference/test_inference_config.py @@ -15,7 +15,7 @@ class TestInferenceConfig(DistributedTest): world_size = 1 def test_overlap_kwargs(self): - config = {"replace_with_kernel_inject": True} + config = {"replace_with_kernel_inject": True, "dtype": torch.float32} kwargs = {"replace_with_kernel_inject": True} engine = deepspeed.init_inference(torch.nn.Module(), config=config, **kwargs) @@ -37,7 +37,7 @@ def test_kwargs_and_config(self): assert engine._config.dtype == kwargs["dtype"] def test_json_config(self, tmpdir): - config = {"replace_with_kernel_inject": True} + config = {"replace_with_kernel_inject": True, "dtype": "torch.float32"} config_json = create_config_from_dict(tmpdir, config) engine = deepspeed.init_inference(torch.nn.Module(), config=config_json) From 
3244e1f7c5278e05dca8acbbde2a99a7d0c101cf Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Fri, 5 Jan 2024 09:29:54 +0800 Subject: [PATCH 26/26] fix typo --- .github/workflows/cpu-inference.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index 7a166e34c80f..a2ca41f4aa3a 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -56,7 +56,7 @@ jobs: run: | pip install torch python -m pip install intel_extension_for_pytorch - # the curl line is for troubleshootingn + # the curl line is for troubleshooting curl -L https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ python -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ pip install py-cpuinfo
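
Taken together, the test-robustness patches above (02, 03, 23 and 24) converge on a single skip-guard pattern: an inference test bails out early whenever the active accelerator cannot run it, and the CI workflow widens COLUMNS (patches 06 and 07) only so that the resulting skip reasons stay visible in the pytest output instead of being truncated. A minimal stand-alone sketch of that pattern follows. It is an illustration rather than code introduced by the patches: the helper name skip_if_unsupported and the import path deepspeed.ops.op_builder for InferenceBuilder are assumptions made for the example.

# Illustrative sketch only: it mirrors the guards added to
# tests/unit/inference/test_inference.py, but the helper itself is
# hypothetical and not part of this patch series.
import pytest
import torch

import deepspeed
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder import InferenceBuilder  # assumed import path


def skip_if_unsupported(dtype, model_name=None):
    """Skip the current inference test when the active accelerator cannot run it."""
    accel = get_accelerator()
    # Patch 02: the accelerator (e.g. CPU) may not support the requested dtype.
    if dtype not in accel.supported_dtypes():
        pytest.skip(f"Accelerator {accel.device_name()} does not support {dtype}.")
    # Patch 03: kernel-injection tests require the InferenceBuilder op to be buildable here.
    if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
        pytest.skip("InferenceBuilder op is not implemented on this system.")
    # Patch 24: one model/dtype combination with known numerical drift is skipped outright.
    if dtype == torch.bfloat16 and model_name == "Salesforce/codegen-350M-mono":
        pytest.skip("Codegen model (bf16) disabled due to slight result difference.")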