From 33fd672924e82ee78b2777c64a0367378692fadf Mon Sep 17 00:00:00 2001
From: zzhhjjj
Date: Mon, 22 Apr 2024 14:39:51 +0000
Subject: [PATCH 1/5] readme

---
 examples/mamba/README.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/examples/mamba/README.md b/examples/mamba/README.md
index 5c31d07f..8eefa9c2 100644
--- a/examples/mamba/README.md
+++ b/examples/mamba/README.md
@@ -18,6 +18,18 @@ pip install -r requirements.txt
 
 > https://wandb.ai/bouteille/test/reports/Mamba-loss--Vmlldzo2OTgwNDM5
 
+## Troubleshooting
+You may encounter the following error when running `train_mamba.sh`:
+```
+causal_conv1d_cuda.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZNK3c1017SymbolicShapeMeta18init_is_contiguousEv
+```
+Solve it by reinstalling the packages:
+- `pip uninstall mamba-ssm`
+- `pip install causal_conv1d==1.1.1`
+- `pip install mamba-ssm --no-cache-dir`
+See https://github.com/state-spaces/mamba/issues/169 for details.
+
+
 ## Credits
 Credits to the following repositories from which the code was adapted:
 - https://github.com/state-spaces/mamba
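As a quick sanity check that the reinstall above resolved the undefined-symbol error, the snippet below runs a tiny forward pass through a Mamba block. This is a minimal sketch: it assumes a CUDA GPU, follows the usage shown in the state-spaces/mamba README, and the layer sizes are arbitrary.

```python
# Sanity check that causal_conv1d / mamba-ssm import and run after the reinstall.
import torch
from mamba_ssm import Mamba

layer = Mamba(d_model=64, d_state=16, d_conv=4, expand=2).to("cuda")
x = torch.randn(2, 32, 64, device="cuda")  # (batch, sequence length, d_model)
y = layer(x)
print(y.shape)  # expected: torch.Size([2, 32, 64])
```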
""" dp: int @@ -31,6 +32,7 @@ class ParallelismArgs: pp_engine: Optional[PipelineEngine] = None tp_mode: Optional[TensorParallelLinearMode] = None tp_linear_async_communication: Optional[bool] = None + recompute_layer: bool = False expert_parallel_size: int = 1 diff --git a/src/nanotron/models/llama.py b/src/nanotron/models/llama.py index 32aab9cd..a439768b 100644 --- a/src/nanotron/models/llama.py +++ b/src/nanotron/models/llama.py @@ -18,6 +18,7 @@ import torch from torch import nn +from torch.utils.checkpoint import CheckpointFunction from nanotron import distributed as dist from nanotron import logging @@ -617,12 +618,14 @@ def __init__( self.post_attention_layernorm = TritonRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.mlp = MLP(config=config, parallel_config=parallel_config, tp_pg=tp_pg) - - def forward( + + self.recompute_layer = parallel_config.recompute_layer + + def _core_forward( self, hidden_states: Union[torch.Tensor, TensorPointer], sequence_mask: Union[torch.Tensor, TensorPointer], - ) -> Dict[str, Union[torch.Tensor, TensorPointer]]: + ) -> List[Union[torch.Tensor, TensorPointer]]: residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -635,12 +638,31 @@ def forward( hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] hidden_states = hidden_states + residual + return hidden_states, output["sequence_mask"] + + def _checkpointed_forward( + self, + hidden_states: torch.Tensor, + sequence_mask: torch.Tensor, + ) -> List[torch.Tensor]: + return CheckpointFunction.apply(self._core_forward, hidden_states, sequence_mask) + + def forward( + self, + hidden_states: Union[torch.Tensor, TensorPointer], + sequence_mask: Union[torch.Tensor, TensorPointer], + ) -> Dict[str, Union[torch.Tensor, TensorPointer]]: + + if self.recompute_layer: + hidden_states, sequence_mask = self._checkpointed_forward(hidden_states, sequence_mask) + else: + hidden_states, sequence_mask = self._core_forward(hidden_states, sequence_mask) + return { "hidden_states": hidden_states, - "sequence_mask": output["sequence_mask"], + "sequence_mask": sequence_mask, } - class Embedding(nn.Module, AttachableStore): def __init__(self, tp_pg: dist.ProcessGroup, config: LlamaConfig, parallel_config: Optional[ParallelismArgs]): super().__init__() From 7e15516cf282cc8b1f10b34e5334615f4e124c60 Mon Sep 17 00:00:00 2001 From: Tiancheng Chen Date: Tue, 14 May 2024 23:26:40 +0200 Subject: [PATCH 3/5] layer recompute --- src/nanotron/models/llama.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nanotron/models/llama.py b/src/nanotron/models/llama.py index a439768b..cb1b4d86 100644 --- a/src/nanotron/models/llama.py +++ b/src/nanotron/models/llama.py @@ -645,7 +645,7 @@ def _checkpointed_forward( hidden_states: torch.Tensor, sequence_mask: torch.Tensor, ) -> List[torch.Tensor]: - return CheckpointFunction.apply(self._core_forward, hidden_states, sequence_mask) + return CheckpointFunction.apply(self._core_forward, True, hidden_states, sequence_mask) def forward( self, @@ -653,7 +653,7 @@ def forward( sequence_mask: Union[torch.Tensor, TensorPointer], ) -> Dict[str, Union[torch.Tensor, TensorPointer]]: - if self.recompute_layer: + if self.recompute_layer and not isinstance(hidden_states, TensorPointer): hidden_states, sequence_mask = self._checkpointed_forward(hidden_states, sequence_mask) else: hidden_states, sequence_mask = self._core_forward(hidden_states, sequence_mask) From ed5a11c291e1988e3a86d74a3fba99be9ed6f57f Mon Sep 17 00:00:00 2001 From: 
From ed5a11c291e1988e3a86d74a3fba99be9ed6f57f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?X=CE=BBRI-U5?=
Date: Mon, 8 Jul 2024 17:05:47 +0700
Subject: [PATCH 4/5] Update README.md

---
 examples/doremi/README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/examples/doremi/README.md b/examples/doremi/README.md
index 5a726bd1..dfc9ea40 100644
--- a/examples/doremi/README.md
+++ b/examples/doremi/README.md
@@ -87,3 +87,7 @@ For evaluation, we do uniform sampling on the test set to evaluate a 2.5B model
 - 2.5B llama trained using the optimized weights: https://huggingface.co/nanotron/doremi-llama-2.5b-optimized-weights
 
 and the dataset: https://huggingface.co/datasets/nanotron/the-pile-for-doremi
+
+#### Thoughts
+
+DoReMi is useful when you don't yet have a good idea of what your training data distribution should be, or when you want a quick way to find a baseline that beats uniform sampling before tuning the distribution by hand. In my previous experiments, DoReMi matched the pretraining performance of the distribution used for mamba training but couldn't outperform it. I suspect it struggles when the differences are subtle, i.e. when the gap between your known best distribution and a genuinely better one isn't significant.

From d5cf7c42896645bad0b73c48641bf68085b62e0a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?X=CE=BBRI-U5?=
Date: Mon, 8 Jul 2024 17:07:18 +0700
Subject: [PATCH 5/5] Update README.md

---
 examples/mup/README.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/examples/mup/README.md b/examples/mup/README.md
index c86850ca..ed94c1fb 100644
--- a/examples/mup/README.md
+++ b/examples/mup/README.md
@@ -32,3 +32,8 @@ We trained a 350m model with spectral µTransfer and standard parametrization us
 Please check the directory [[./examples/mup/configs]](/examples/mup/configs) for the configurations we used to reproduce the experiments.
 
 ![LLaMA](./assets/llama.png)
+
+
+#### Thoughts
+
+We only validated spectral µTransfer on an MLP [link] and a 300m LLaMA [link] (links to the experiment configs are in the µP README). However, when we tested it on 1B/8B models (iirc), the loss blew up for reasons we haven't pinned down. So we'd recommend trying standard µTransfer rather than spectral µTransfer.
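As a companion to the DoReMi note in PATCH 4, here is a compact sketch of the domain-weight update that DoReMi iterates: an exponentiated-gradient step on each domain's excess loss (proxy loss minus reference loss, clipped at zero), followed by smoothing toward the uniform distribution. The function name, step size, and smoothing constant are illustrative assumptions, not nanotron's implementation.

```python
# One DoReMi domain-weight update (illustrative sketch, not nanotron's code).
import torch


def doremi_update(domain_weights: torch.Tensor,
                  proxy_loss: torch.Tensor,
                  ref_loss: torch.Tensor,
                  step_size: float = 1.0,
                  smoothing: float = 1e-3) -> torch.Tensor:
    # Excess loss: how much worse the proxy model is than the reference, per domain.
    excess = torch.clamp(proxy_loss - ref_loss, min=0.0)
    # Exponentiated-gradient ascent: upweight domains with large excess loss.
    new_weights = torch.softmax(torch.log(domain_weights) + step_size * excess, dim=0)
    # Smooth toward uniform so no domain's weight collapses to zero.
    uniform = torch.full_like(new_weights, 1.0 / new_weights.numel())
    return (1.0 - smoothing) * new_weights + smoothing * uniform


weights = torch.full((5,), 0.2)  # start from uniform over 5 domains
proxy = torch.tensor([2.9, 3.1, 2.5, 3.4, 2.8])
ref = torch.tensor([2.8, 3.0, 2.6, 3.0, 2.7])
weights = doremi_update(weights, proxy, ref)
print(weights)  # the domain with the largest excess loss (index 3) is upweighted most
```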