Optimized DeepSeek-v2 on Gaudi #1677

Merged (16 commits) on Jan 28, 2025
Refactor the code.
gyou2021 committed Jan 28, 2025
commit 8a4c1a810f72a63abf9f34eaa709e84e88d7d054
@@ -62,16 +62,22 @@

 # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
 # It means that the function will not be traced through and simply appear as a node in the graph.
-if is_torch_fx_available():
-    if not is_torch_greater_or_equal_than_1_13:
-        import torch.fx
+# if is_torch_fx_available():
+#     if not is_torch_greater_or_equal_than_1_13:
+#         import torch.fx

-    _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
+# _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
+
+import torch.fx
+_prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)

 logger = logging.get_logger(__name__)

 _CONFIG_FOR_DOC = "DeepseekV2Config"

+# default expert number per slice for dynamic MoE
+SLICE_MAX_EXPERT = 80
+
 try:
     from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV2 as FusedRoPE

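For context on the hunk above: `torch.fx.wrap` registers a function as a leaf, so symbolic tracing records a single call node for it instead of tracing into its body; the commit simply applies the wrap unconditionally rather than behind the `is_torch_fx_available` / torch-version guard. The sketch below is not part of this PR; `make_causal_mask` and `TinyModel` are hypothetical stand-ins for `_prepare_4d_causal_attention_mask` and the model.

# Minimal sketch of the torch.fx.wrap leaf-function pattern; illustrative names only.
import torch
import torch.fx


def make_causal_mask(x: torch.Tensor) -> torch.Tensor:
    # Build an additive causal mask from the input's sequence length. Shape-dependent
    # Python logic like this is the kind of code FX cannot trace through, which is
    # why the real mask helper is declared a leaf.
    seq_len = x.shape[-1]
    mask = torch.full((seq_len, seq_len), float("-inf"))
    return torch.triu(mask, diagonal=1)


# Same pattern as the unconditional wrap in the diff: tracing now records the helper
# as one opaque call_function node instead of tracing its internals.
make_causal_mask = torch.fx.wrap(make_causal_mask)


class TinyModel(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + make_causal_mask(x)


traced = torch.fx.symbolic_trace(TinyModel())
print(traced.graph)  # the wrapped helper shows up as a single node in the FX graph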
@@ -626,7 +632,7 @@ def __init__(self, config):
         if config.n_shared_experts is not None:
             intermediate_size = config.moe_intermediate_size * config.n_shared_experts
             self.shared_experts = DeepseekV2MLP(config=config, intermediate_size=intermediate_size)
-        SLICE_MAX_EXPERT = 80

         self.expert_slice = math.ceil(config.n_routed_experts / SLICE_MAX_EXPERT)
         self.expert_chunk = self.config.n_routed_experts // self.expert_slice
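The slicing arithmetic above splits the routed experts into groups of at most `SLICE_MAX_EXPERT`. A small sketch of just that math (not the PR's MoE forward pass; the 160-expert count is an assumption used for illustration):

# Sketch of the expert-slicing arithmetic only; hypothetical standalone example.
import math

SLICE_MAX_EXPERT = 80    # default expert number per slice for dynamic MoE (from the diff)
n_routed_experts = 160   # assumed routed-expert count, for illustration only

expert_slice = math.ceil(n_routed_experts / SLICE_MAX_EXPERT)  # -> 2 slices
expert_chunk = n_routed_experts // expert_slice                # -> 80 experts per slice

for i in range(expert_slice):
    start = i * expert_chunk
    end = min(start + expert_chunk, n_routed_experts)
    print(f"slice {i}: experts [{start}, {end})")
# slice 0: experts [0, 80)
# slice 1: experts [80, 160)

With 80 or fewer routed experts, `expert_slice` is 1 and the whole expert set is processed as a single chunk.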