From 33fd672924e82ee78b2777c64a0367378692fadf Mon Sep 17 00:00:00 2001
From: zzhhjjj
Date: Mon, 22 Apr 2024 14:39:51 +0000
Subject: [PATCH 1/5] readme

---
 examples/mamba/README.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/examples/mamba/README.md b/examples/mamba/README.md
index 5c31d07f..8eefa9c2 100644
--- a/examples/mamba/README.md
+++ b/examples/mamba/README.md
@@ -18,6 +18,18 @@ pip install -r requirements.txt
 
 > https://wandb.ai/bouteille/test/reports/Mamba-loss--Vmlldzo2OTgwNDM5
 
+## Troubleshooting
+You may encounter the following error when running `train_mamba.sh`:
+```
+causal_conv1d_cuda.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZNK3c1017SymbolicShapeMeta18init_is_contiguousEv
+```
+Solve it by reinstalling the packages:
+- `pip uninstall mamba-ssm`
+- `pip install causal_conv1d==1.1.1`
+- `pip install mamba-ssm --no-cache-dir`
+See https://github.com/state-spaces/mamba/issues/169 for details.
+
+
 ## Credits
 Credits to the following repositories from which the code was adapted:
 - https://github.com/state-spaces/mamba
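As a quick sanity check that the reinstall above resolved the undefined-symbol error, the snippet below runs a tiny forward pass through a Mamba block. This is a minimal sketch: it assumes a CUDA GPU, follows the usage shown in the state-spaces/mamba README, and the layer sizes are arbitrary.

```python
# Sanity check that causal_conv1d / mamba-ssm import and run after the reinstall.
import torch
from mamba_ssm import Mamba

layer = Mamba(d_model=64, d_state=16, d_conv=4, expand=2).to("cuda")
x = torch.randn(2, 32, 64, device="cuda")  # (batch, sequence length, d_model)
y = layer(x)
print(y.shape)  # expected: torch.Size([2, 32, 64])
```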
""" dp: int @@ -31,6 +32,7 @@ class ParallelismArgs: pp_engine: Optional[PipelineEngine] = None tp_mode: Optional[TensorParallelLinearMode] = None tp_linear_async_communication: Optional[bool] = None + recompute_layer: bool = False expert_parallel_size: int = 1 diff --git a/src/nanotron/models/llama.py b/src/nanotron/models/llama.py index 32aab9cd..a439768b 100644 --- a/src/nanotron/models/llama.py +++ b/src/nanotron/models/llama.py @@ -18,6 +18,7 @@ import torch from torch import nn +from torch.utils.checkpoint import CheckpointFunction from nanotron import distributed as dist from nanotron import logging @@ -617,12 +618,14 @@ def __init__( self.post_attention_layernorm = TritonRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.mlp = MLP(config=config, parallel_config=parallel_config, tp_pg=tp_pg) - - def forward( + + self.recompute_layer = parallel_config.recompute_layer + + def _core_forward( self, hidden_states: Union[torch.Tensor, TensorPointer], sequence_mask: Union[torch.Tensor, TensorPointer], - ) -> Dict[str, Union[torch.Tensor, TensorPointer]]: + ) -> List[Union[torch.Tensor, TensorPointer]]: residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -635,12 +638,31 @@ def forward( hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] hidden_states = hidden_states + residual + return hidden_states, output["sequence_mask"] + + def _checkpointed_forward( + self, + hidden_states: torch.Tensor, + sequence_mask: torch.Tensor, + ) -> List[torch.Tensor]: + return CheckpointFunction.apply(self._core_forward, hidden_states, sequence_mask) + + def forward( + self, + hidden_states: Union[torch.Tensor, TensorPointer], + sequence_mask: Union[torch.Tensor, TensorPointer], + ) -> Dict[str, Union[torch.Tensor, TensorPointer]]: + + if self.recompute_layer: + hidden_states, sequence_mask = self._checkpointed_forward(hidden_states, sequence_mask) + else: + hidden_states, sequence_mask = self._core_forward(hidden_states, sequence_mask) + return { "hidden_states": hidden_states, - "sequence_mask": output["sequence_mask"], + "sequence_mask": sequence_mask, } - class Embedding(nn.Module, AttachableStore): def __init__(self, tp_pg: dist.ProcessGroup, config: LlamaConfig, parallel_config: Optional[ParallelismArgs]): super().__init__() From 7e15516cf282cc8b1f10b34e5334615f4e124c60 Mon Sep 17 00:00:00 2001 From: Tiancheng Chen Date: Tue, 14 May 2024 23:26:40 +0200 Subject: [PATCH 3/5] layer recompute --- src/nanotron/models/llama.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nanotron/models/llama.py b/src/nanotron/models/llama.py index a439768b..cb1b4d86 100644 --- a/src/nanotron/models/llama.py +++ b/src/nanotron/models/llama.py @@ -645,7 +645,7 @@ def _checkpointed_forward( hidden_states: torch.Tensor, sequence_mask: torch.Tensor, ) -> List[torch.Tensor]: - return CheckpointFunction.apply(self._core_forward, hidden_states, sequence_mask) + return CheckpointFunction.apply(self._core_forward, True, hidden_states, sequence_mask) def forward( self, @@ -653,7 +653,7 @@ def forward( sequence_mask: Union[torch.Tensor, TensorPointer], ) -> Dict[str, Union[torch.Tensor, TensorPointer]]: - if self.recompute_layer: + if self.recompute_layer and not isinstance(hidden_states, TensorPointer): hidden_states, sequence_mask = self._checkpointed_forward(hidden_states, sequence_mask) else: hidden_states, sequence_mask = self._core_forward(hidden_states, sequence_mask) From ed5a11c291e1988e3a86d74a3fba99be9ed6f57f Mon Sep 17 00:00:00 2001 From: 
From ed5a11c291e1988e3a86d74a3fba99be9ed6f57f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?X=CE=BBRI-U5?=
Date: Mon, 8 Jul 2024 17:05:47 +0700
Subject: [PATCH 4/5] Update README.md

---
 examples/doremi/README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/examples/doremi/README.md b/examples/doremi/README.md
index 5a726bd1..dfc9ea40 100644
--- a/examples/doremi/README.md
+++ b/examples/doremi/README.md
@@ -87,3 +87,7 @@ For evaluation, we do uniform sampling on the test set to evaluate a 2.5B model
 - 2.5B llama trained using the optimized weights: https://huggingface.co/nanotron/doremi-llama-2.5b-optimized-weights
 
 and the dataset: https://huggingface.co/datasets/nanotron/the-pile-for-doremi
+
+#### Thoughts
+
+DoReMi is useful when you don't yet have a good idea of what your training data distribution should be, or when you want a quick way to find a baseline that beats uniform sampling before tuning the distribution by hand. In my previous experiments, DoReMi matched the pretraining performance of the distribution used for mamba training but couldn't outperform it. I suspect it struggles when the differences are subtle, i.e. when the gap between your known best distribution and a genuinely better one isn't significant.

From d5cf7c42896645bad0b73c48641bf68085b62e0a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?X=CE=BBRI-U5?=
Date: Mon, 8 Jul 2024 17:07:18 +0700
Subject: [PATCH 5/5] Update README.md

---
 examples/mup/README.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/examples/mup/README.md b/examples/mup/README.md
index c86850ca..ed94c1fb 100644
--- a/examples/mup/README.md
+++ b/examples/mup/README.md
@@ -32,3 +32,8 @@ We trained a 350m model with spectral µTransfer and standard parametrization us
 Please check the directory [[./examples/mup/configs]](/examples/mup/configs) for the configurations we used to reproduce the experiments.
 
 ![LLaMA](./assets/llama.png)
+
+
+#### Thoughts
+
+We only validated spectral µTransfer on an MLP [link] and a 300m LLaMA [link] (links to the experiment configs are in the µP README). However, when we tested it on 1B/8B models (iirc), the loss blew up for reasons we haven't pinned down. So we'd recommend trying standard µTransfer rather than spectral µTransfer.
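As a companion to the DoReMi note in PATCH 4, here is a compact sketch of the domain-weight update that DoReMi iterates: an exponentiated-gradient step on each domain's excess loss (proxy loss minus reference loss, clipped at zero), followed by smoothing toward the uniform distribution. The function name, step size, and smoothing constant are illustrative assumptions, not nanotron's implementation.

```python
# One DoReMi domain-weight update (illustrative sketch, not nanotron's code).
import torch


def doremi_update(domain_weights: torch.Tensor,
                  proxy_loss: torch.Tensor,
                  ref_loss: torch.Tensor,
                  step_size: float = 1.0,
                  smoothing: float = 1e-3) -> torch.Tensor:
    # Excess loss: how much worse the proxy model is than the reference, per domain.
    excess = torch.clamp(proxy_loss - ref_loss, min=0.0)
    # Exponentiated-gradient ascent: upweight domains with large excess loss.
    new_weights = torch.softmax(torch.log(domain_weights) + step_size * excess, dim=0)
    # Smooth toward uniform so no domain's weight collapses to zero.
    uniform = torch.full_like(new_weights, 1.0 / new_weights.numel())
    return (1.0 - smoothing) * new_weights + smoothing * uniform


weights = torch.full((5,), 0.2)  # start from uniform over 5 domains
proxy = torch.tensor([2.9, 3.1, 2.5, 3.4, 2.8])
ref = torch.tensor([2.8, 3.0, 2.6, 3.0, 2.7])
weights = doremi_update(weights, proxy, ref)
print(weights)  # the domain with the largest excess loss (index 3) is upweighted most
```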