From a95bb3ecae2880abc1987da125a1a2450df60c65 Mon Sep 17 00:00:00 2001
From: Logan Adams
Date: Tue, 7 Jan 2025 20:12:04 -0800
Subject: [PATCH 1/9] Update with changes from torchfix

---
 tests/unit/alexnet_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/alexnet_model.py b/tests/unit/alexnet_model.py
index 51e80e7f9e62..6fe84edf4eda 100644
--- a/tests/unit/alexnet_model.py
+++ b/tests/unit/alexnet_model.py
@@ -84,7 +84,7 @@ def cast_to_half(x):
 
 def cifar_trainset(fp16=False):
     torchvision = pytest.importorskip("torchvision", minversion="0.5.0")
-    import torchvision.transforms as transforms
+    from torchvision import transforms
 
     transform_list = [
         transforms.ToTensor(),

From 34a35bd18423fe70044b6372c2b1d23ec2af3695 Mon Sep 17 00:00:00 2001
From: Logan Adams
Date: Tue, 7 Jan 2025 20:19:48 -0800
Subject: [PATCH 2/9] Update call to torch.norm as it is deprecated

---
 deepspeed/runtime/comm/hccl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepspeed/runtime/comm/hccl.py b/deepspeed/runtime/comm/hccl.py
index 09fb11a731b8..b8639c7da4c9 100644
--- a/deepspeed/runtime/comm/hccl.py
+++ b/deepspeed/runtime/comm/hccl.py
@@ -83,7 +83,7 @@ def compressed_allreduce(self, buffer_m: torch.tensor, worker_error, server_erro
 
         compensated_server_m.add_(server_error)
 
-        server_scale = torch.norm(compensated_server_m) / np.sqrt(compensated_server_m.numel())
+        server_scale = torch.linalg.norm(compensated_server_m) / np.sqrt(compensated_server_m.numel())
 
         server_error.set_(compensated_server_m -
                           server_scale * compensated_server_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0))

From 7893103cb96c4e9adbab9b5154c594afc90f0c72 Mon Sep 17 00:00:00 2001
From: Logan Adams
Date: Wed, 8 Jan 2025 10:32:22 -0800
Subject: [PATCH 3/9] Update all instances of torch.norm to torch.linalg.norm

---
 deepspeed/runtime/comm/compressed.py    | 2 +-
 deepspeed/runtime/zero/stage_1_and_2.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/deepspeed/runtime/comm/compressed.py b/deepspeed/runtime/comm/compressed.py
index 7f8c7395451d..2c5482eb1ad7 100644
--- a/deepspeed/runtime/comm/compressed.py
+++ b/deepspeed/runtime/comm/compressed.py
@@ -96,7 +96,7 @@ def compressed_allreduce(self, buffer_m: torch.tensor, worker_error, server_erro
 
         compensated_server_m.add_(server_error)
 
-        server_scale = torch.norm(compensated_server_m) / np.sqrt(compensated_server_m.numel())
+        server_scale = torch.linalg.norm(compensated_server_m) / np.sqrt(compensated_server_m.numel())
 
         server_error.set_(compensated_server_m -
                           server_scale * compensated_server_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0))

diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py
index ecb2a527f870..a1dbf84dddd6 100755
--- a/deepspeed/runtime/zero/stage_1_and_2.py
+++ b/deepspeed/runtime/zero/stage_1_and_2.py
@@ -1691,7 +1691,7 @@ def get_grad_norm_direct(self, gradients, params, norm_type=2):
                     continue
                 if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0):
                     all_norms.append(
-                        torch.norm(g.data.double().detach(), norm_type).to(get_accelerator().current_device_name()))
+                        torch.linalg.norm(g.data.double().detach(), norm_type).to(get_accelerator().current_device_name()))
             if len(all_norms) > 0:
                 total_norm = torch.stack(all_norms).square().sum().float()
             else:
@@ -1795,7 +1795,7 @@ def scaled_global_norm(self, norm_type=2):
             self._average_expert_grad_norms(norm_groups)
 
         # calculating L2 norm
-        return torch.norm(torch.stack(norm_groups), p=norm_type)
+        return torch.linalg.norm(torch.stack(norm_groups), p=norm_type)
 
     def get_bit16_param_group(self, group_no):
         bit16_partitions = self.parallel_partitioned_bit16_groups[group_no]

From 022f223697296f182f1b677d83d41a4abb062b0b Mon Sep 17 00:00:00 2001
From: Logan Adams
Date: Wed, 8 Jan 2025 10:58:53 -0800
Subject: [PATCH 4/9] Formatting

---
 deepspeed/runtime/zero/stage_1_and_2.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py
index a1dbf84dddd6..67e49b6f79d3 100755
--- a/deepspeed/runtime/zero/stage_1_and_2.py
+++ b/deepspeed/runtime/zero/stage_1_and_2.py
@@ -1691,7 +1691,8 @@ def get_grad_norm_direct(self, gradients, params, norm_type=2):
                     continue
                 if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0):
                     all_norms.append(
-                        torch.linalg.norm(g.data.double().detach(), norm_type).to(get_accelerator().current_device_name()))
+                        torch.linalg.norm(g.data.double().detach(),
+                                          norm_type).to(get_accelerator().current_device_name()))
             if len(all_norms) > 0:
                 total_norm = torch.stack(all_norms).square().sum().float()
             else:

From 7bb1a61f3fdff8344674f4c983d024d43bd342e9 Mon Sep 17 00:00:00 2001
From: Logan Adams
Date: Wed, 8 Jan 2025 14:28:20 -0800
Subject: [PATCH 5/9] Switch function signature to ord from p

---
 deepspeed/runtime/zero/stage_1_and_2.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py
index 67e49b6f79d3..f91d81659f98 100755
--- a/deepspeed/runtime/zero/stage_1_and_2.py
+++ b/deepspeed/runtime/zero/stage_1_and_2.py
@@ -1691,8 +1691,7 @@ def get_grad_norm_direct(self, gradients, params, norm_type=2):
                     continue
                 if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0):
                     all_norms.append(
-                        torch.linalg.norm(g.data.double().detach(),
-                                          norm_type).to(get_accelerator().current_device_name()))
+                        torch.linalg.norm(g.data.double().detach(), ord=norm_type).to(get_accelerator().current_device_name()))
             if len(all_norms) > 0:
                 total_norm = torch.stack(all_norms).square().sum().float()
             else:
@@ -1796,7 +1795,7 @@ def scaled_global_norm(self, norm_type=2):
             self._average_expert_grad_norms(norm_groups)
 
         # calculating L2 norm
-        return torch.linalg.norm(torch.stack(norm_groups), p=norm_type)
+        return torch.linalg.norm(torch.stack(norm_groups), ord=norm_type)
 
     def get_bit16_param_group(self, group_no):
         bit16_partitions = self.parallel_partitioned_bit16_groups[group_no]

From c803152e60480e487fa611baaa451dd5c5a5cc29 Mon Sep 17 00:00:00 2001
From: Logan Adams
Date: Wed, 8 Jan 2025 14:29:10 -0800
Subject: [PATCH 6/9] formatting

---
 deepspeed/runtime/zero/stage_1_and_2.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py
index f91d81659f98..ec653b32dced 100755
--- a/deepspeed/runtime/zero/stage_1_and_2.py
+++ b/deepspeed/runtime/zero/stage_1_and_2.py
@@ -1691,7 +1691,8 @@ def get_grad_norm_direct(self, gradients, params, norm_type=2):
                     continue
                 if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0):
                     all_norms.append(
-                        torch.linalg.norm(g.data.double().detach(), ord=norm_type).to(get_accelerator().current_device_name()))
+                        torch.linalg.norm(g.data.double().detach(),
+                                          ord=norm_type).to(get_accelerator().current_device_name()))
             if len(all_norms) > 0:
                 total_norm = torch.stack(all_norms).square().sum().float()
             else:

From 8ff83e2ecfee5e0825b4bba1ad695148ae147899 Mon Sep 17 00:00:00 2001
From: Logan Adams
Date: Tue, 14 Jan 2025 11:12:04 -0800
Subject: [PATCH 7/9] Update to use torch.linalg.vector_norm

---
 deepspeed/runtime/zero/stage3.py        | 2 +-
 deepspeed/runtime/zero/stage_1_and_2.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py
index 28f91cb9b3ab..9c06567ed100 100644
--- a/deepspeed/runtime/zero/stage3.py
+++ b/deepspeed/runtime/zero/stage3.py
@@ -2101,7 +2101,7 @@ def step(self, closure=None):
             return
 
         norm_groups = self._get_norm_groups()
-        scaled_global_grad_norm = torch.linalg.norm(torch.stack(norm_groups))
+        scaled_global_grad_norm = torch.linalg.vector_norm(torch.stack(norm_groups))
 
         # Stash unscaled gradient norm
         self._global_grad_norm = scaled_global_grad_norm / self.loss_scale

diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py
index ec653b32dced..8866ff25de4b 100755
--- a/deepspeed/runtime/zero/stage_1_and_2.py
+++ b/deepspeed/runtime/zero/stage_1_and_2.py
@@ -1691,7 +1691,7 @@ def get_grad_norm_direct(self, gradients, params, norm_type=2):
                     continue
                 if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0):
                     all_norms.append(
-                        torch.linalg.norm(g.data.double().detach(),
+                        torch.linalg.vector_norm(g.data.double().detach(),
                                           ord=norm_type).to(get_accelerator().current_device_name()))
             if len(all_norms) > 0:
                 total_norm = torch.stack(all_norms).square().sum().float()
@@ -1796,7 +1796,7 @@ def scaled_global_norm(self, norm_type=2):
             self._average_expert_grad_norms(norm_groups)
 
         # calculating L2 norm
-        return torch.linalg.norm(torch.stack(norm_groups), ord=norm_type)
+        return torch.linalg.vector_norm(torch.stack(norm_groups), ord=norm_type)
 
     def get_bit16_param_group(self, group_no):
         bit16_partitions = self.parallel_partitioned_bit16_groups[group_no]

From b4066f5cc7eb48eb6081e5b1b51ce39d3a24fb4d Mon Sep 17 00:00:00 2001
From: Logan Adams
Date: Tue, 14 Jan 2025 11:14:35 -0800
Subject: [PATCH 8/9] Formatting

---
 deepspeed/runtime/zero/stage_1_and_2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py
index 8866ff25de4b..1835d4fbe8f4 100755
--- a/deepspeed/runtime/zero/stage_1_and_2.py
+++ b/deepspeed/runtime/zero/stage_1_and_2.py
@@ -1692,7 +1692,7 @@ def get_grad_norm_direct(self, gradients, params, norm_type=2):
                 if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0):
                     all_norms.append(
                         torch.linalg.vector_norm(g.data.double().detach(),
-                                          ord=norm_type).to(get_accelerator().current_device_name()))
+                                                 ord=norm_type).to(get_accelerator().current_device_name()))
             if len(all_norms) > 0:
                 total_norm = torch.stack(all_norms).square().sum().float()
             else:

From 8db1efbfa968a8d596a1ebfe2bf6a78e546c0ea6 Mon Sep 17 00:00:00 2001
From: Logan Adams
Date: Fri, 17 Jan 2025 10:21:40 -0800
Subject: [PATCH 9/9] Update lamb to be more clear since it does an L2 vector norm

---
 deepspeed/runtime/fp16/onebit/lamb.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepspeed/runtime/fp16/onebit/lamb.py b/deepspeed/runtime/fp16/onebit/lamb.py
index 89b6f40a308c..9e7bae816ecd 100644
--- a/deepspeed/runtime/fp16/onebit/lamb.py
+++ b/deepspeed/runtime/fp16/onebit/lamb.py
@@ -177,7 +177,7 @@ def step(self, closure=None, grads=None):
                 # This is used to reduce compression error during compression stage.
                 momentum_scales = []
                 for group in self.param_groups:
-                    momentum_scales.append([(torch.linalg.norm(self.state[p]['exp_avg']) /
+                    momentum_scales.append([(torch.linalg.vector_norm(self.state[p]['exp_avg']) /
                                              np.sqrt(torch.numel(self.state[p]['exp_avg']))).item()
                                             for p in group['params']])
                 united_scale = sum([sum(x) for x in momentum_scales]) / sum([len(x) for x in momentum_scales])
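
Note (editorial addition, not part of the patch series above): these replacements rely on torch.linalg.vector_norm(x, ord=p) computing the same value as the deprecated torch.norm(x, p) at these call sites, namely a p-norm over all elements of the tensor. vector_norm always flattens its input, while torch.linalg.norm(A, ord=2) on a 2-D tensor computes the spectral (matrix) norm, which is presumably why the intermediate torch.linalg.norm calls were later moved to vector_norm. A minimal sanity-check sketch (tensor shape chosen arbitrarily for illustration):

    import torch

    x = torch.randn(64, 128, dtype=torch.float64)

    # Deprecated API: with a numeric p and dim=None, torch.norm flattens the input.
    old_default = torch.norm(x)        # default 'fro' == L2 norm over all elements
    old_p2 = torch.norm(x, 2)          # vector 2-norm of the flattened tensor

    # Replacement API: vector_norm always flattens; the keyword is `ord`, not `p`.
    new_default = torch.linalg.vector_norm(x)
    new_p2 = torch.linalg.vector_norm(x, ord=2)

    assert torch.allclose(old_default, new_default)
    assert torch.allclose(old_p2, new_p2)

    # By contrast, torch.linalg.norm on a 2-D input with ord=2 is a matrix norm
    # (largest singular value), which generally differs from the values above.
    spectral = torch.linalg.norm(x, ord=2)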