From 38fcb8bcbe665100430507c16415119c54a57559 Mon Sep 17 00:00:00 2001
From: NouamaneTazi
Date: Mon, 2 Dec 2024 14:24:03 +0000
Subject: [PATCH] config_dp

---
 examples/config_tiny_llama.yaml |   2 +-
 run_multinode.sh                |   4 +-
 scaling_benchmarks.py           | 109 ++++++++++++++++++++++++++------
 3 files changed, 92 insertions(+), 23 deletions(-)

diff --git a/examples/config_tiny_llama.yaml b/examples/config_tiny_llama.yaml
index 75eb4328..9714a6d2 100644
--- a/examples/config_tiny_llama.yaml
+++ b/examples/config_tiny_llama.yaml
@@ -1,5 +1,5 @@
 # /fsx/nouamane/miniconda/envs/2-1-cu121/bin/torchrun --nproc_per_node=8 run_train.py --config-file examples/config_tiny_llama.yaml
-# NANOTRON_BENCHMARK=1 CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=8 run_train.py --config-file examples/config_tiny_llama.yaml
+# NANOTRON_BENCHMARK=1 CUDA_DEVICE_MAX_CONNECTIONS=1 /fsx/nouamane/miniconda/envs/2-1-cu121/bin/torchrun --nproc_per_node=8 run_train.py --config-file examples/config_tiny_llama.yaml
 checkpoints:
   checkpoint_interval: 10000
   checkpoints_path: checkpoints
diff --git a/run_multinode.sh b/run_multinode.sh
index a285660b..cc64a10a 100644
--- a/run_multinode.sh
+++ b/run_multinode.sh
@@ -5,11 +5,11 @@
 #SBATCH --partition=hopper-prod
 #SBATCH --qos=high

-#SBATCH -o /fsx/nouamane/projects/nanotron/logs/%x-%j.out
+#SBATCH -o /fsx/nouamane/projects/nanotron/logs/%j-%x.out

 #SBATCH --nodes=2                    # Number of nodes (modify as needed)
 #SBATCH --ntasks-per-node=1          # Number of tasks per node
-#SBATCH --cpus-per-task=80           # CPU cores per task
+#SBATCH --cpus-per-task=60           # CPU cores per task
 #SBATCH --gres=gpu:8                 # Number of GPUs per node
 #SBATCH --exclusive                  # Exclusive use of nodes
diff --git a/scaling_benchmarks.py b/scaling_benchmarks.py
index 73a7764f..4182a67a 100644
--- a/scaling_benchmarks.py
+++ b/scaling_benchmarks.py
@@ -1,9 +1,18 @@
-#!/usr/bin/env python3
+# python scaling_benchmarks.py
 import argparse
 import math
 import os
 
 import yaml
+from nanotron.logging import human_format
+
+VOCAB_SIZE = 32768
+
+
+def estimate_num_params(layers, hidden_size, heads, intermediate_size, tie_word_embeddings):
+    # params = 2*V*h + l(3*h*H + 4*h*h) = (2)Vh + 16lh^2
+    vocab = VOCAB_SIZE * hidden_size if tie_word_embeddings else 2 * VOCAB_SIZE * hidden_size
+    return vocab + layers * (3 * hidden_size * intermediate_size + 4 * hidden_size * hidden_size)
 
 
 def create_config(
@@ -15,6 +24,9 @@
     micro_batch_size: int = 1,
     base_config_path: str = "examples/config_tiny_llama.yaml",
     zero_stage: int = 0,
+    num_layers: int = 24,
+    hidden_size: int = 2048,
+    num_attention_heads: int = 16,
 ) -> dict:
     """Create a config with the specified parallelism settings."""
     # Load base config
@@ -35,16 +47,37 @@
     config["model"]["model_config"]["max_position_embeddings"] = seq_len
     config["tokens"]["micro_batch_size"] = micro_batch_size
 
+    # Modify model architecture settings
+    config["model"]["model_config"]["num_hidden_layers"] = num_layers
+    config["model"]["model_config"]["hidden_size"] = hidden_size
+    config["model"]["model_config"]["num_attention_heads"] = num_attention_heads
+    config["model"]["model_config"]["num_key_value_heads"] = num_attention_heads  # No GQA / MQA
+    config["model"]["model_config"]["intermediate_size"] = 4 * hidden_size
+    # config["model"]["model_config"]["tie_word_embeddings"] = True if hidden_size < 3000 else False
+
+    # Set vocab_size to 32k to reduce memory usage
+    config["model"]["model_config"]["vocab_size"] = VOCAB_SIZE
+
     # modify zero stage
     config["optimizer"]["zero_stage"] = zero_stage
 
+    N = human_format(
+        estimate_num_params(
+            num_layers,
+            hidden_size,
+            num_attention_heads,
+            config["model"]["model_config"]["intermediate_size"],
+            config["model"]["model_config"]["tie_word_embeddings"],
+        )
+    )
+
     # Update run name to reflect configuration
     config["general"][
         "run"
-    ] = f"dp{dp}_tp{tp}_pp{pp}_acc{batch_accum}_mbs{micro_batch_size}_seq{seq_len}_zero{zero_stage}"
+    ] = f"{N}_dp{dp}_tp{tp}_pp{pp}_acc{batch_accum}_mbs{micro_batch_size}_seq{seq_len}_zero{zero_stage}_l{num_layers}_h{hidden_size}_heads{num_attention_heads}"
 
     # Update benchmark CSV path
-    config["general"]["benchmark_csv_path"] = "bench.csv"
+    config["general"]["benchmark_csv_path"] = "bench_dp.csv"
 
     return config
 
@@ -104,6 +137,7 @@ def main():
     )
     parser.add_argument("--base-script", type=str, default="run_multinode.sh", help="Base SLURM script to use")
     parser.add_argument("--run", action="store_true", help="Automatically submit all generated SLURM scripts")
+    parser.add_argument("--debug", action="store_true", help="Debug mode")
     args = parser.parse_args()
 
     # Validate input files exist
@@ -116,34 +150,69 @@
     for directory in [args.configs_dir, args.scripts_dir]:
         os.makedirs(directory, exist_ok=True)
 
+    # Define model configurations
+    model_configs = {
+        # params = 2*V*h + l(3*h*H + 4*h*h) = (2)Vh + 16lh^2
+        # (layers, hidden_size, heads)
+        # "138M": (12, 768, 12),
+        # "200M": (12, 1024, 16),
+        # "500M": (12, 1536, 16),
+        # "1000M": (15, 2048, 16),
+        "1700M": (24, 2048, 16),  # (layers, hidden_size, heads)
+        "4300M": (28, 3072, 20),
+        "8700M": (32, 4096, 32),
+        "11B": (42, 4096, 32),
+    }
+
     # Define configurations to test
-    configurations = [
-        # (dp, tp, pp, batch_accum, seq_len, mbs)
-        # (1, 8, 1, 1, 2048, 1),  # Base configuration
-        # (2, 4, 1, 1, 2048, 1),
-        # (8, 1, 1, 1, 2048, 1),
-        # (16, 1, 1, 1, 2048, 1),
-        *[(2**i, 1, 1, 1, 2048, 1) for i in range(0, 8)],
-        *[(2**i, 8, 1, 1, 2048, 1) for i in range(0, 7)],
-        *[(2**i, 8, 1, 1, 2048, 8) for i in range(0, 7)],
-        # 64k seq len
-        *[(2**i, 8, 1, 1, 65536, 1) for i in range(0, 7)],  # 64 nodes max
-    ]
+    configurations = []
+
+    # For each model size, test different GPU configurations
+    for model_name, (num_layers, hidden_size, num_heads) in model_configs.items():
+        # Test each model with different GPU counts while maintaining 4M tokens/step
+        model_configs = [
+            # Format: (dp, tp, pp, batch_accum, seq_len, mbs, ...)
+            # (8, 1, 1, 1, 4096, 1, num_layers, hidden_size, num_heads),
+            # 128 GPU configuration
+            (128, 1, 1, 4, 4096, 2, num_layers, hidden_size, num_heads),
+            # 512 GPU configuration
+            (512, 1, 1, 1, 4096, 2, num_layers, hidden_size, num_heads),
+        ]
+        configurations.extend(model_configs)
+
+    if args.debug:
+        print("Debug mode: only running 1 configuration")
+        configurations = configurations[:1]
 
     # Validate configurations
-    for dp, tp, pp, batch_accum, seq_len, mbs in configurations:
+    for dp, tp, pp, batch_accum, seq_len, mbs, num_layers, hidden_size, num_heads in configurations:
         total_gpus = dp * tp * pp
-        if total_gpus > 64:  # Assuming maximum of 8 nodes with 8 GPUs each
+        if total_gpus > 512:
             print(
                 f"Warning: Configuration dp={dp}, tp={tp}, pp={pp} requires {total_gpus} GPUs, which might be too many"
             )
 
+        # Calculate tokens per step to verify batch size
+        tokens_per_step = dp * tp * pp * mbs * batch_accum * seq_len
+        print(f"Model {hidden_size}H_{num_layers}L: {total_gpus} GPUs, " f"{tokens_per_step:,} GBS")
+
     # Generate configs and scripts
-    generated_scripts = []  # Keep track of generated script paths
-    for dp, tp, pp, batch_accum, seq_len, mbs in configurations:
+    generated_scripts = []
+    for dp, tp, pp, batch_accum, seq_len, mbs, num_layers, hidden_size, num_heads in configurations:
         try:
             # Create config
-            config = create_config(dp, tp, pp, batch_accum, seq_len, mbs, base_config_path=args.base_config)
+            config = create_config(
+                dp=dp,
+                tp=tp,
+                pp=pp,
+                batch_accum=batch_accum,
+                seq_len=seq_len,
+                micro_batch_size=mbs,
+                base_config_path=args.base_config,
+                num_layers=num_layers,
+                hidden_size=hidden_size,
+                num_attention_heads=num_heads,
+            )
 
             # Save config
             config_path = os.path.join(args.configs_dir, f"config_{config['general']['run']}.yaml")
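
Note (not part of the patch): the model-size keys in model_configs and the "4M tokens/step" comment can be sanity-checked against the patch's parameter estimate. The sketch below is a hypothetical standalone helper, not the committed script; it assumes tie_word_embeddings=True, whereas the real script reads that flag from the base config.

# sanity_check.py -- hypothetical helper mirroring the patch's estimate; not part of the commit
VOCAB_SIZE = 32768


def estimate_num_params(layers, hidden_size, heads, intermediate_size, tie_word_embeddings):
    # Same estimate as the patch (heads is unused): embeddings + layers * (3*h*H + 4*h*h) ~= 16*l*h^2 when H = 4h
    vocab = VOCAB_SIZE * hidden_size if tie_word_embeddings else 2 * VOCAB_SIZE * hidden_size
    return vocab + layers * (3 * hidden_size * intermediate_size + 4 * hidden_size * hidden_size)


# (layers, hidden_size, heads), as listed in the patch
model_configs = {"1700M": (24, 2048, 16), "4300M": (28, 3072, 20), "8700M": (32, 4096, 32), "11B": (42, 4096, 32)}

for name, (layers, hidden, heads) in model_configs.items():
    n = estimate_num_params(layers, hidden, heads, 4 * hidden, tie_word_embeddings=True)  # tied embeddings assumed
    print(f"{name}: ~{n / 1e9:.2f}B params")  # ~1.68B, ~4.33B, ~8.72B, ~11.41B

# Both benchmark shapes keep the global batch at roughly 4M tokens per step:
# tokens/step = dp * batch_accum * micro_batch_size * seq_len
for dp, batch_accum, mbs, seq_len in [(128, 4, 2, 4096), (512, 1, 2, 4096)]:
    print(f"dp={dp}: {dp * batch_accum * mbs * seq_len:,} tokens/step")  # 4,194,304 for both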