Commit

[CLEANUP]

Kye committed Jan 22, 2024
1 parent 0c8fa5b commit ea25ce9
Showing 6 changed files with 63 additions and 8 deletions.
7 changes: 1 addition & 6 deletions LICENSE
@@ -2,12 +2,7 @@ MIT License

Copyright (c) 2023 Eternal Reclaimer

-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "swarms-torch"
version = "0.1.9"
version = "0.2.1"
description = "swarms-torch - Pytorch"
license = "MIT"
authors = ["Kye Gomez <[email protected]>"]
1 change: 1 addition & 0 deletions swarms_torch/__init__.py
@@ -12,6 +12,7 @@
Particle,
TransformerParticleSwarmOptimization,
)
+from swarms_torch.structs import * # noqa

__all__ = [
"ParticleSwarmOptimization",
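The wildcard re-export above should make the struct classes importable from the package root. A minimal sketch of the intended effect, assuming SwitchMoE and SimpleMoE are listed in swarms_torch.structs.__all__:

from swarms_torch import SwitchMoE, SimpleMoE  # assumed re-export via the new wildcard import
from swarms_torch.structs.switch_moe import SwitchGate, SwitchMoE  # fully qualified path from this commit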
2 changes: 1 addition & 1 deletion swarms_torch/structs/__init__.py
@@ -1,5 +1,5 @@
from swarms_torch.structs.parallel_wrapper import ParallelSwarm
-from swarms_torch.structs.moe import SwitchGate, SwitchMoE
+from swarms_torch.structs.switch_moe import SwitchGate, SwitchMoE
from swarms_torch.structs.simple_moe import GatingMechanism, SimpleMoE

__all__ = [
59 changes: 59 additions & 0 deletions swarms_torch/structs/mixtral_expert.py
@@ -0,0 +1,59 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


class SwiGLU(nn.Module):
    """Gated feed-forward expert: out = W2(SiLU(W1 x) * W3 x), as used in Mixtral's experts."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.fc3 = nn.Linear(input_dim, hidden_dim)  # gate projection

    def forward(self, x):
        return self.fc2(F.silu(self.fc1(x)) * self.fc3(x))


class TopKGate(nn.Module):
    """Router that keeps the top-k expert logits per token and softmaxes over them."""

    def __init__(self, model_dim, num_experts, top_k):
        super().__init__()
        self.w_gate = nn.Linear(model_dim, num_experts)
        self.top_k = top_k

    def forward(self, x):
        gate_logits = self.w_gate(x)  # (batch_size, seq_len, num_experts)
        top_logits, top_indices = torch.topk(gate_logits, self.top_k, dim=-1)
        # Mask out all but the top-k experts before the softmax.
        top_k_logits = torch.full_like(gate_logits, float("-inf"))
        top_k_logits.scatter_(-1, top_indices, top_logits)
        return F.softmax(top_k_logits, dim=-1)


class MoE(nn.Module):
    """Mixture-of-Experts layer.

    Every expert is evaluated and the outputs are combined with the (top-k masked)
    gate scores. This is a dense reference implementation; it does not sparsely
    dispatch tokens to experts.
    """

    def __init__(self, model_dim, hidden_dim, num_experts, top_k):
        super().__init__()
        self.experts = nn.ModuleList(
            [SwiGLU(model_dim, hidden_dim, model_dim) for _ in range(num_experts)]
        )
        self.gate = TopKGate(model_dim, num_experts, top_k)

    def forward(self, x):
        gate_scores = self.gate(x)  # (batch_size, seq_len, num_experts)
        # (batch_size, seq_len, num_experts, model_dim)
        expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=2)
        weighted_expert_outputs = gate_scores.unsqueeze(-1) * expert_outputs
        return weighted_expert_outputs.sum(dim=2)


# Model architecture parameters (Mixtral-scale; several are listed for reference
# only and are unused in this demonstration).
model_dim = 4096
n_layers = 32
head_dim = 128
hidden_dim = 14336
n_heads = 32
context_len = 32768
vocab_size = 32000
num_experts = 8
top_k_experts = 2

# Create a single MoE layer as a demonstration.
moe_layer = MoE(model_dim, hidden_dim, num_experts, top_k_experts)

# Example input tensor: (batch_size, seq_len, model_dim). At the full 32k context
# length this forward pass is very memory- and compute-heavy on CPU.
x = torch.rand(1, context_len, model_dim)

# Forward pass through the MoE layer.
output = moe_layer(x)

print(output)
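For a quick sanity check without the Mixtral-scale memory cost, a smaller, purely illustrative configuration can be run against the MoE class defined above; the values below are hypothetical and only confirm that the output shape matches the input shape:

# Hypothetical small configuration; reuses the MoE class defined above.
small_moe = MoE(model_dim=64, hidden_dim=128, num_experts=4, top_k=2)
x_small = torch.rand(2, 16, 64)  # (batch_size, seq_len, model_dim)
out = small_moe(x_small)
print(out.shape)  # expected: torch.Size([2, 16, 64])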
swarms_torch/structs/moe.py → swarms_torch/structs/switch_moe.py
File renamed without changes.
