From 91829476a8fd4d0d9268c03c1d56795d20a51c12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=84=8D=F0=9D=95=A0=F0=9D=95=9D=F0=9D=95=9D=F0=9D=95=A0?= =?UTF-8?q?=F0=9D=95=A8=20=F0=9D=95=84=F0=9D=95=92=F0=9D=95=9F?= Date: Thu, 12 Dec 2024 22:48:32 +0200 Subject: [PATCH 01/10] Avoid poisoning process with CUDA calls as soon as importing (#6810) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Call `torch.cuda.device_count() > 0` before `torch.cuda.is_available()`, to give priority to nvml based availability, so that we can try not to poison process with CUDA calls as soon as we execute `import deepspeed`. https://github.com/pytorch/pytorch/blob/v2.5.1/torch/cuda/__init__.py#L120-L124 There are 2 reasons to make this change: Firstly, if we accidentally import deepspeed, since the CUDA runtime initializes when the first CUDA API call is made and caches the device list, changing the CUDA_VISIBLE_DEVICES within the same process after initialization won't have any effect on the visible devices. The specific case: https://github.com/OpenRLHF/OpenRLHF/pull/524#issuecomment-2501505023 A demo for reproduction before the fix is applied: ```python import torch import os os.environ["CUDA_VISIBLE_DEVICES"] = "" import deepspeed os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" torch.cuda.set_device('cuda:0') ``` Secondly, https://pytorch.org/docs/stable/notes/cuda.html When assessing the availability of CUDA in a given environment (is_available()), PyTorch’s default behavior is to call the CUDA Runtime API method cudaGetDeviceCount. Because this call in turn initializes the CUDA Driver API (via cuInit) if it is not already initialized, subsequent forks of a process that has run is_available() will fail with a CUDA initialization error. 
Signed-off-by: Hollow Man Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- accelerator/real_accelerator.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/accelerator/real_accelerator.py b/accelerator/real_accelerator.py index 69e96d285bb8..a6173ac70abd 100644 --- a/accelerator/real_accelerator.py +++ b/accelerator/real_accelerator.py @@ -167,7 +167,12 @@ def get_accelerator(): import torch # Determine if we are on a GPU or x86 CPU with torch. - if torch.cuda.is_available(): #ignore-cuda + # "torch.cuda.is_available()" provides a stronger guarantee, #ignore-cuda + # ensuring that we are free from CUDA initialization errors. + # While "torch.cuda.device_count() > 0" check ensures that #ignore-cuda + # we won't try to do any CUDA calls when no device is available + # For reference: https://github.com/microsoft/DeepSpeed/pull/6810 + if torch.cuda.device_count() > 0 and torch.cuda.is_available(): #ignore-cuda accelerator_name = "cuda" else: if accel_logger is not None: From 853a97648b9ba3acbb990018eab1dd928a08c390 Mon Sep 17 00:00:00 2001 From: Liangliang Ma Date: Sat, 14 Dec 2024 03:29:48 +0800 Subject: [PATCH 02/10] Fix xpu tests workflow failure by changing pip index url (#6864) Update xpu-max1100.yml and xpu-compile.yml --- .github/workflows/xpu-compile.yml | 8 ++++---- .github/workflows/xpu-max1100.yml | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/xpu-compile.yml b/.github/workflows/xpu-compile.yml index e095e089fc30..9e8bd9d792fb 100644 --- a/.github/workflows/xpu-compile.yml +++ b/.github/workflows/xpu-compile.yml @@ -31,10 +31,10 @@ jobs: run: | apt-get update apt-get install clinfo libaio-dev python3-pip -y - pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torch/ - pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/intel-extension-for-pytorch/ - pip 
install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/oneccl-bind-pt/ - pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torchvision/ + pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torch/ + pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/intel-extension-for-pytorch/ + pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/oneccl-bind-pt/ + pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torchvision/ pip install https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v3.0.0b2/triton_xpu-3.0.0b2-cp310-cp310-linux_x86_64.whl pip install py-cpuinfo numpy pip install .[dev,autotuning] diff --git a/.github/workflows/xpu-max1100.yml b/.github/workflows/xpu-max1100.yml index d19e73aeef1c..56bff4a88ba9 100644 --- a/.github/workflows/xpu-max1100.yml +++ b/.github/workflows/xpu-max1100.yml @@ -47,10 +47,10 @@ jobs: run: | apt-get update apt-get install clinfo libaio-dev python3-pip -y - pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torch/ - pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/intel-extension-for-pytorch/ - pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/oneccl-bind-pt/ - pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torchvision/ + pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torch/ + pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/intel-extension-for-pytorch/ + pip install oneccl_bind_pt==2.3.100+xpu -f 
https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/oneccl-bind-pt/ + pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torchvision/ pip install py-cpuinfo numpy pip install .[dev,autotuning] From d7750c34291b9dcd892de4a795ecd0e35b28f6ee Mon Sep 17 00:00:00 2001 From: Guanhua Wang Date: Fri, 13 Dec 2024 11:40:41 -0800 Subject: [PATCH 03/10] Domino updates (#6861) Updating our website for Domino --------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- docs/_tutorials/domino.md | 6 ++++++ docs/index.md | 6 +++--- 2 files changed, 9 insertions(+), 3 deletions(-) create mode 100644 docs/_tutorials/domino.md diff --git a/docs/_tutorials/domino.md b/docs/_tutorials/domino.md new file mode 100644 index 000000000000..6b116cb87463 --- /dev/null +++ b/docs/_tutorials/domino.md @@ -0,0 +1,6 @@ +--- +title: "Domino" +tags: training +--- + +Domino achieves near-complete communication hiding behind computation for tensor parallel training. Please find our [Domino-tutorial](https://github.com/microsoft/DeepSpeedExamples/blob/master/training/DeepSpeed-Domino/README.md) in DeepSpeedExample repo. diff --git a/docs/index.md b/docs/index.md index 3279682b42d4..3d5f290f2bde 100755 --- a/docs/index.md +++ b/docs/index.md @@ -7,25 +7,25 @@ title: "Latest News" --- DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat). 
+* [2024/12] [DeepSpeed Domino: Communication-Free LLM Training Engine](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-domino/README.md) + * [2024/08] [DeepSpeed on Windows](https://github.com/microsoft/DeepSpeed/blob/master/blogs/windows/08-2024/README.md)[[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/windows/08-2024/japanese/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/windows/08-2024/chinese/README.md)] * [2024/08] [DeepNVMe: Improving DL Applications through I/O Optimizations](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-gds/README.md)[[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-gds/japanese/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-gds/chinese/README.md)] * [2024/07] [DeepSpeed Universal Checkpointing: Efficient and Flexible Checkpointing for Large Scale Distributed Training](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/README.md)[[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/japanese/README.md)] * [2024/03] [DeepSpeed-FP6: The Power of FP6-Centric Serving for Large Language Models](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README.md) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md)] -* [2024/01] [DeepSpeed-FastGen: Introducting Mixtral, Phi-2, and Falcon support with major performance and feature enhancements.](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19)
More news
From b5e3fac6a599f80a7db1899e47ea6993131bb51b Mon Sep 17 00:00:00 2001 From: Guanhua Wang Date: Fri, 13 Dec 2024 12:59:08 -0800 Subject: [PATCH 04/10] add domino navigation (#6866) add domino item into navigation list --- docs/_data/navigation.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/_data/navigation.yml b/docs/_data/navigation.yml index b17685b8dc4d..c3ebad4f86af 100755 --- a/docs/_data/navigation.yml +++ b/docs/_data/navigation.yml @@ -75,6 +75,8 @@ lnav: url: /tutorials/data-efficiency/ - title: 'DeepNVMe' url: /tutorials/deepnvme/ + - title: 'Domino' + url: /tutorials/domino/ - title: 'DS4Sci_EvoformerAttention' url: /tutorials/ds4sci_evoformerattention/ - title: 'Flops Profiler' From 8efbcc495c3c7c072d10bfd672932807fb9eb8e5 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Fri, 13 Dec 2024 16:49:08 -0500 Subject: [PATCH 05/10] Update TSC (#6867) --- COMMITTERS.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/COMMITTERS.md b/COMMITTERS.md index bcb8579bf1f7..8418bdf8629d 100644 --- a/COMMITTERS.md +++ b/COMMITTERS.md @@ -5,5 +5,7 @@ | Olatunji Ruwase | [tjruwase](https://github.com/tjruwase) | Microsoft | | Logan Adams | [loadams](https://github.com/loadams) | Microsoft | | Masahiro Tanaka | [tohtana](https://github.com/tohtana) | Microsoft | -| Jeff Rasley | [jeffra](https://github.com/jeffra) | SnowFlake | -| Minjia Zhang | [minjiazhang](https://github.com/minjiazhang) | UIUC | +| Jeff Rasley | [jeffra](https://github.com/jeffra) | SnowFlake | +| Minjia Zhang | [minjiazhang](https://github.com/minjiazhang) | UIUC | +| Ashwin Aji | [ashwinma](https://github.com/ashwinma) | AMD | +| Sam Foreman | [saforem2](https://github.com/saforem2) | Argonne National Laboratory | From 6e3e13cb280b684ebedb5c2aecb36efb545ebfce Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Fri, 13 Dec 2024 15:35:12 -0800 Subject: [PATCH 06/10] Remove warnings from autodoc and sphinx (#6788) 
Co-authored-by: Olatunji Ruwase --- deepspeed/runtime/fp16/onebit/zoadam.py | 4 +++- deepspeed/runtime/lr_schedules.py | 4 ++-- docs/code-docs/source/monitor.rst | 18 +++++++++--------- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/deepspeed/runtime/fp16/onebit/zoadam.py b/deepspeed/runtime/fp16/onebit/zoadam.py index 803bd929742d..70282ec41714 100644 --- a/deepspeed/runtime/fp16/onebit/zoadam.py +++ b/deepspeed/runtime/fp16/onebit/zoadam.py @@ -12,9 +12,11 @@ class ZeroOneAdam(torch.optim.Optimizer): - """Implements the 0/1 Adam algorithm. Currently GPU-only. + """ + Implements the 0/1 Adam algorithm. Currently GPU-only. For usage example please see https://www.deepspeed.ai/tutorials/zero-one-adam/ For technical details please read https://arxiv.org/abs/2202.06009 + Arguments: params (iterable): iterable of parameters to optimize or dicts defining parameter groups. diff --git a/deepspeed/runtime/lr_schedules.py b/deepspeed/runtime/lr_schedules.py index f25a19e8e499..899358e2c5ef 100755 --- a/deepspeed/runtime/lr_schedules.py +++ b/deepspeed/runtime/lr_schedules.py @@ -274,7 +274,7 @@ class LRRangeTest(object): """Sets the learning rate of each parameter group according to learning rate range test (LRRT) policy. The policy increases learning rate starting from a base value with a constant frequency, as detailed in - the paper `A disciplined approach to neural network hyper-parameters: Part1`_. + the paper `A disciplined approach to neural network hyper-parameters: Part 1 `_ LRRT policy is used for finding maximum LR that trains a model without divergence, and can be used to configure the LR boundaries for Cyclic LR schedules. @@ -379,7 +379,7 @@ class OneCycle(object): 1CLR policy changes the learning rate after every batch. `step` should be called after a batch has been used for training. - This implementation was adapted from the github repo: `pytorch/pytorch`_ + This implementation was adapted from the github repo: `PyTorch `_. 
Args: optimizer (Optimizer): Wrapped optimizer. diff --git a/docs/code-docs/source/monitor.rst b/docs/code-docs/source/monitor.rst index 694c72b9b870..b185ed433c1c 100644 --- a/docs/code-docs/source/monitor.rst +++ b/docs/code-docs/source/monitor.rst @@ -9,15 +9,15 @@ overview of what DeepSpeed will log automatically. :header: "Field", "Description", "Condition" :widths: 20, 20, 10 - `Train/Samples/train_loss`,The training loss.,None - `Train/Samples/lr`,The learning rate during training.,None - `Train/Samples/loss_scale`,The loss scale when training using `fp16`.,`fp16` must be enabled. - `Train/Eigenvalues/ModelBlockParam_{i}`,Eigen values per param block.,`eigenvalue` must be enabled. - `Train/Samples/elapsed_time_ms_forward`,The global duration of the forward pass.,`flops_profiler.enabled` or `wall_clock_breakdown`. - `Train/Samples/elapsed_time_ms_backward`,The global duration of the forward pass.,`flops_profiler.enabled` or `wall_clock_breakdown`. - `Train/Samples/elapsed_time_ms_backward_inner`,The backward time that does not include the gradient reduction time. Only in cases where the gradient reduction is not overlapped, if it is overlapped then the inner time should be about the same as the entire backward time.,`flops_profiler.enabled` or `wall_clock_breakdown`. - `Train/Samples/elapsed_time_ms_backward_allreduce`,The global duration of the allreduce operation.,`flops_profiler.enabled` or `wall_clock_breakdown`. - `Train/Samples/elapsed_time_ms_step`,The optimizer step time,`flops_profiler.enabled` or `wall_clock_breakdown`. + `Train/Samples/train_loss`,"The training loss.",None + `Train/Samples/lr`,"The learning rate during training.",None + `Train/Samples/loss_scale`,"The loss scale when training using `fp16`.",`fp16` must be enabled. + `Train/Eigenvalues/ModelBlockParam_{i}`,"Eigen values per param block.",`eigenvalue` must be enabled. 
+ `Train/Samples/elapsed_time_ms_forward`,"The global duration of the forward pass.",`flops_profiler.enabled` or `wall_clock_breakdown`. + `Train/Samples/elapsed_time_ms_backward`,"The global duration of the forward pass.",`flops_profiler.enabled` or `wall_clock_breakdown`. + `Train/Samples/elapsed_time_ms_backward_inner`,"The backward time that does not include the gradient reduction time. Only in cases where the gradient reduction is not overlapped, if it is overlapped then the inner time should be about the same as the entire backward time.",`flops_profiler.enabled` or `wall_clock_breakdown`. + `Train/Samples/elapsed_time_ms_backward_allreduce`,"The global duration of the allreduce operation.",`flops_profiler.enabled` or `wall_clock_breakdown`. + `Train/Samples/elapsed_time_ms_step`,"The optimizer step time.",`flops_profiler.enabled` or `wall_clock_breakdown`. TensorBoard ----------- From fc7c07007fe341bf6d78a9126d0cb5a914ce28fd Mon Sep 17 00:00:00 2001 From: keiwoo Date: Sat, 14 Dec 2024 08:41:43 +0800 Subject: [PATCH 07/10] Update real_accelerator.py (#6845) ### Comment out or delete `accelerator_name="cpu"` when `xpu` is not detected. When `xpu` is not detected, the code simply passes at lines 68 to 74 if `DS_ACCELERATOR` is set. However, `cpu` is assigned to `accelerator_name` if it cannot import `intel_extension_for_pytorch` or find `xpu`, namely at lines 125 to 133, when `DS_ACCELERATOR` is not set. I found this problem yesterday and spent a whole afternoon figuring it out. I got `intel_extension_for_pytorch` installed with another package which I do not actually use, and had no idea about this. Then I found that `cpu` is assigned to `accelerator_name` directly if it cannot find `xpu`, and this affects `cuda` detection. In fact, `cpu` will be assigned in the end anyway if `cuda` is not detected either, at lines 170 to 177.
--------- Co-authored-by: Olatunji Ruwase Co-authored-by: Logan Adams Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- accelerator/real_accelerator.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/accelerator/real_accelerator.py b/accelerator/real_accelerator.py index a6173ac70abd..ced9218d7aca 100644 --- a/accelerator/real_accelerator.py +++ b/accelerator/real_accelerator.py @@ -125,10 +125,9 @@ def get_accelerator(): if accelerator_name is None: try: import intel_extension_for_pytorch as ipex + if ipex._C._has_xpu(): accelerator_name = "xpu" - else: - accelerator_name = "cpu" except ImportError as e: pass if accelerator_name is None: @@ -162,7 +161,6 @@ def get_accelerator(): except ImportError as e: pass if accelerator_name is None: - # borrow this log from PR#5084 try: import torch @@ -174,16 +172,16 @@ def get_accelerator(): # For reference: https://github.com/microsoft/DeepSpeed/pull/6810 if torch.cuda.device_count() > 0 and torch.cuda.is_available(): #ignore-cuda accelerator_name = "cuda" - else: - if accel_logger is not None: - accel_logger.warn( - "Setting accelerator to CPU. If you have GPU or other accelerator, we were unable to detect it." - ) - accelerator_name = "cpu" except (RuntimeError, ImportError) as e: # TODO need a more decent way to detect which accelerator to use, consider using nvidia-smi command for detection - accelerator_name = "cuda" pass + if accelerator_name is None: + # borrow this log from PR#5084 + if accel_logger is not None: + accel_logger.warn( + "Setting accelerator to CPU. 
If you have GPU or other accelerator, we were unable to detect it.") + # cpu added as catch-all when accelerator detection fails + accelerator_name = "cpu" ds_set_method = "auto detect" From db98cc3ad1e0a20807e0c2513f0eee40f626860e Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> Date: Mon, 16 Dec 2024 11:05:55 -0800 Subject: [PATCH 08/10] Fix assertion for offloading states (#6855) This PR fixes the assertions in `offload_states` method mentioned in #6833. Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- deepspeed/runtime/engine.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 0aad018528d3..5f023d87f375 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -3738,6 +3738,11 @@ def offload_states(self, assert self.zero_optimization_stage( ) == ZeroStageEnum.weights, "Moving buffers across devices is supported only for ZeRO stage 3." + opt_offload_config = self.zero_offload_optimizer() + assert opt_offload_config is None or opt_offload_config.device == OffloadDeviceEnum.none, "Moving states across devices is not supported for offloaded optimizer states." + param_offload_config = self.zero_offload_param() + assert param_offload_config is None or param_offload_config.device == OffloadDeviceEnum.none, "Moving states across devices is not supported for offloaded parameters." + assert not self.zero_offload_param(), "Moving states across devices is not supported for offloaded parameters." 
if device == OffloadDeviceEnum.none: From 87c650681eb285ab34a69a011b520f756f42d4b9 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Mon, 16 Dec 2024 11:21:51 -0800 Subject: [PATCH 09/10] Remove pin from transformers version and fix Processing/Threading issues in tests (#6822) Changes from https://github.com/huggingface/transformers/pull/34966 caused the `nv-torch-latest-v100` tests to fail with the following error: ``` File "/tmp/azureml/cr/j/e4bfd57a509846d6bbc4914639ad248d/exe/wd/actions-runner/_work/DeepSpeed/DeepSpeed/unit-test-venv/lib/python3.10/site-packages/transformers/modeling_utils.py", line 3941, in from_pretrained raise EnvironmentError( OSError: Can't load the model for 'hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2' is the correct path to a directory containing a file named pytorch_model.bin, tf_model.h5, model.ckpt or flax_model.msgpack. 
``` Sample failure here: https://github.com/microsoft/DeepSpeed/actions/runs/12169422174/job/33942348835?pr=6794#step:8:3506 This was resolved on the Transformers side here: https://github.com/huggingface/transformers/pull/35236 --- .github/workflows/cpu-torch-latest.yml | 2 +- .github/workflows/nv-torch-latest-v100.yml | 2 +- .github/workflows/nv-torch-nightly-v100.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cpu-torch-latest.yml b/.github/workflows/cpu-torch-latest.yml index 51bc60c2c2ae..78a51905834b 100644 --- a/.github/workflows/cpu-torch-latest.yml +++ b/.github/workflows/cpu-torch-latest.yml @@ -42,7 +42,7 @@ jobs: git clone https://github.com/huggingface/transformers cd transformers # if needed switch to the last known good SHA until transformers@master is fixed - git checkout 6c3f168b3 + # git checkout 6c3f168b3 git rev-parse --short HEAD pip install . diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml index 2d69d0b94cb5..a1ba4937d164 100644 --- a/.github/workflows/nv-torch-latest-v100.yml +++ b/.github/workflows/nv-torch-latest-v100.yml @@ -38,7 +38,7 @@ jobs: git clone https://github.com/huggingface/transformers cd transformers # if needed switch to the last known good SHA until transformers@master is fixed - git checkout 6c3f168b3 + # git checkout 6c3f168b3 git rev-parse --short HEAD pip install . diff --git a/.github/workflows/nv-torch-nightly-v100.yml b/.github/workflows/nv-torch-nightly-v100.yml index c2d10a454f94..0a9570a1ceaa 100644 --- a/.github/workflows/nv-torch-nightly-v100.yml +++ b/.github/workflows/nv-torch-nightly-v100.yml @@ -37,7 +37,7 @@ jobs: git clone https://github.com/huggingface/transformers cd transformers # if needed switch to the last known good SHA until transformers@master is fixed - git checkout 6c3f168b3 + # git checkout 6c3f168b3 git rev-parse --short HEAD pip install . 
From da771ed42e41a44d5047813ca4672f1cfe9d1731 Mon Sep 17 00:00:00 2001 From: Yejing-Lai Date: Tue, 17 Dec 2024 06:14:53 +0800 Subject: [PATCH 10/10] Add MLP/lm_head tp grain size setting. (#6828) This PR aims to add MLP/lm_head tp size granularity setting to deepspeed.init_inference() API. It will be more flexible to set the MLP/lm_head sharding grain size. DNN library favors tensor size in granularity of power of 2, we pick 64 as a default size. We aim to be able to set the MLP/lm_head tp grain size flexibly. This is a preliminary solution. If there is a better solution, we can discuss it together. Thanks~ --------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Olatunji Ruwase --- deepspeed/inference/config.py | 3 +++ deepspeed/module_inject/replace_module.py | 5 ++++- deepspeed/module_inject/tp_shard.py | 11 ++++++++--- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/deepspeed/inference/config.py b/deepspeed/inference/config.py index c7c7684fff79..42ffebbc4386 100644 --- a/deepspeed/inference/config.py +++ b/deepspeed/inference/config.py @@ -40,6 +40,9 @@ class DeepSpeedTPConfig(DeepSpeedConfigModel): tp_size: int = 1 """ Number of devices to split the model across using tensor parallelism. """ + tp_grain_size: int = 64 + "Desired MLP/lm_head tp size granularity. DNN library favors tensor size in granularity of power of 2, we pick 64 as a default size." 
+ mpu: object = None """ A model parallelism unit object that implements diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 7afe6ca903fb..e59f84bc8453 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -17,7 +17,7 @@ from .layers import TensorParallelOcShardConv2d, TensorParallelIcShardConv2d from deepspeed import comm as dist -from deepspeed.module_inject.tp_shard import set_num_kv_heads, set_n_embd, set_num_attention_heads +from deepspeed.module_inject.tp_shard import set_num_kv_heads, set_n_embd, set_num_attention_heads, set_tp_grain_size from .load_checkpoint import load_model_with_checkpoint import time @@ -303,6 +303,9 @@ def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None): if hasattr(model_config, 'num_attention_heads'): set_num_attention_heads(getattr(model_config, 'num_attention_heads')) + # 4.4 set tp_grain_size + set_tp_grain_size(config.tensor_parallel.tp_grain_size) + # 5. 
Set linear policies _autotp.update_linear_policies() diff --git a/deepspeed/module_inject/tp_shard.py b/deepspeed/module_inject/tp_shard.py index 57be0c793856..3e6fc2b63ef1 100644 --- a/deepspeed/module_inject/tp_shard.py +++ b/deepspeed/module_inject/tp_shard.py @@ -22,6 +22,11 @@ def set_n_embd(num): n_embd = num +def set_tp_grain_size(num): + global tp_grain_size + tp_grain_size = num + + def get_num_kv_heads(): global num_kv_heads if 'num_kv_heads' in globals(): @@ -45,9 +50,9 @@ def get_shard_size(total_size, mp_size, name=None, rank=None): my_slices = (num_kv_heads // mp_size) + (1 if rank < (num_kv_heads % mp_size) else 0) return total_size * my_slices // num_kv_heads else: - if total_size >= 64: - grain_size = total_size // 64 - return (grain_size // mp_size + (1 if rank < (grain_size % mp_size) else 0)) * 64 + if total_size >= tp_grain_size: + grain_size = total_size // tp_grain_size + return (grain_size // mp_size + (1 if rank < (grain_size % mp_size) else 0)) * tp_grain_size else: return total_size // mp_size + (1 if rank < (total_size % mp_size) else 0)