Commit

[CLEANUP]

Kye committed Jan 22, 2024
1 parent 0c8fa5b commit ea25ce9
Showing 6 changed files with 63 additions and 8 deletions.
7 changes: 1 addition & 6 deletions LICENSE
@@ -2,12 +2,7 @@ MIT License

Copyright (c) 2023 Eternal Reclaimer

-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "swarms-torch"
version = "0.1.9"
version = "0.2.1"
description = "swarms-torch - Pytorch"
license = "MIT"
authors = ["Kye Gomez <[email protected]>"]
1 change: 1 addition & 0 deletions swarms_torch/__init__.py
@@ -12,6 +12,7 @@
Particle,
TransformerParticleSwarmOptimization,
)
+from swarms_torch.structs import * # noqa

__all__ = [
"ParticleSwarmOptimization",
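The wildcard re-export above should make the struct classes importable from the package root. A minimal sketch of the intended effect, assuming SwitchMoE and SimpleMoE are listed in swarms_torch.structs.__all__:

from swarms_torch import SwitchMoE, SimpleMoE  # assumed re-export via the new wildcard import
from swarms_torch.structs.switch_moe import SwitchGate, SwitchMoE  # fully qualified path from this commit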
2 changes: 1 addition & 1 deletion swarms_torch/structs/__init__.py
@@ -1,5 +1,5 @@
from swarms_torch.structs.parallel_wrapper import ParallelSwarm
-from swarms_torch.structs.moe import SwitchGate, SwitchMoE
+from swarms_torch.structs.switch_moe import SwitchGate, SwitchMoE
from swarms_torch.structs.simple_moe import GatingMechanism, SimpleMoE

__all__ = [
59 changes: 59 additions & 0 deletions swarms_torch/structs/mixtral_expert.py
@@ -0,0 +1,59 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


class SwiGLU(nn.Module):
    """Gated feed-forward expert: out = W2(SiLU(W1 x) * W3 x), as used in Mixtral's experts."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.fc3 = nn.Linear(input_dim, hidden_dim)  # gate projection

    def forward(self, x):
        return self.fc2(F.silu(self.fc1(x)) * self.fc3(x))


class TopKGate(nn.Module):
    """Router that keeps the top-k expert logits per token and softmaxes over them."""

    def __init__(self, model_dim, num_experts, top_k):
        super().__init__()
        self.w_gate = nn.Linear(model_dim, num_experts)
        self.top_k = top_k

    def forward(self, x):
        gate_logits = self.w_gate(x)  # (batch_size, seq_len, num_experts)
        top_logits, top_indices = torch.topk(gate_logits, self.top_k, dim=-1)
        # Mask out all but the top-k experts before the softmax.
        top_k_logits = torch.full_like(gate_logits, float("-inf"))
        top_k_logits.scatter_(-1, top_indices, top_logits)
        return F.softmax(top_k_logits, dim=-1)


class MoE(nn.Module):
    """Mixture-of-Experts layer.

    Every expert is evaluated and the outputs are combined with the (top-k masked)
    gate scores. This is a dense reference implementation; it does not sparsely
    dispatch tokens to experts.
    """

    def __init__(self, model_dim, hidden_dim, num_experts, top_k):
        super().__init__()
        self.experts = nn.ModuleList(
            [SwiGLU(model_dim, hidden_dim, model_dim) for _ in range(num_experts)]
        )
        self.gate = TopKGate(model_dim, num_experts, top_k)

    def forward(self, x):
        gate_scores = self.gate(x)  # (batch_size, seq_len, num_experts)
        # (batch_size, seq_len, num_experts, model_dim)
        expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=2)
        weighted_expert_outputs = gate_scores.unsqueeze(-1) * expert_outputs
        return weighted_expert_outputs.sum(dim=2)


# Model architecture parameters (Mixtral-scale; several are listed for reference
# only and are unused in this demonstration).
model_dim = 4096
n_layers = 32
head_dim = 128
hidden_dim = 14336
n_heads = 32
context_len = 32768
vocab_size = 32000
num_experts = 8
top_k_experts = 2

# Create a single MoE layer as a demonstration.
moe_layer = MoE(model_dim, hidden_dim, num_experts, top_k_experts)

# Example input tensor: (batch_size, seq_len, model_dim). At the full 32k context
# length this forward pass is very memory- and compute-heavy on CPU.
x = torch.rand(1, context_len, model_dim)

# Forward pass through the MoE layer.
output = moe_layer(x)

print(output)
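For a quick sanity check without the Mixtral-scale memory cost, a smaller, purely illustrative configuration can be run against the MoE class defined above; the values below are hypothetical and only confirm that the output shape matches the input shape:

# Hypothetical small configuration; reuses the MoE class defined above.
small_moe = MoE(model_dim=64, hidden_dim=128, num_experts=4, top_k=2)
x_small = torch.rand(2, 16, 64)  # (batch_size, seq_len, model_dim)
out = small_moe(x_small)
print(out.shape)  # expected: torch.Size([2, 16, 64])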
swarms_torch/structs/moe.py → swarms_torch/structs/switch_moe.py
File renamed without changes.
