diff --git a/examples/config_tiny_llama.yaml b/examples/config_tiny_llama.yaml
index 58645e2d..01d3fb1b 100644
--- a/examples/config_tiny_llama.yaml
+++ b/examples/config_tiny_llama.yaml
@@ -1,3 +1,4 @@
+# /fsx/nouamane/miniconda/envs/2-1-cu121/bin/torchrun --nproc_per_node=8 run_train.py --config-file examples/config_tiny_llama.yaml
 checkpoints:
   checkpoint_interval: 10
   checkpoints_path: checkpoints
@@ -6,29 +7,11 @@ checkpoints:
   save_initial_state: false
 data_stages:
 - data:
-    dataset:
-      dataset_overwrite_cache: false
-      dataset_processing_num_proc_per_process: 1
-      hf_dataset_config_name: null
-      hf_dataset_or_datasets: stas/openwebtext-10k
-      hf_dataset_splits: train
-      text_column_name: text
+    dataset: null # Custom dataloader will be used
     num_loading_workers: 1
     seed: 42
   name: Stable Training Stage
   start_training_step: 1
-- data:
-    dataset:
-      dataset_overwrite_cache: false
-      dataset_processing_num_proc_per_process: 1
-      hf_dataset_config_name: null
-      hf_dataset_or_datasets: stas/openwebtext-10k
-      hf_dataset_splits: train
-      text_column_name: text
-    num_loading_workers: 1
-    seed: 42
-  name: Annealing Phase
-  start_training_step: 10
 general:
   benchmark_csv_path: null
   consumed_train_samples: null
@@ -46,27 +29,27 @@ model:
   ddp_bucket_cap_mb: 25
   dtype: bfloat16
   init_method:
-    std: 0.025
+    std: 0.02
   make_vocab_size_divisible_by: 1
   model_config:
-    bos_token_id: 1
-    eos_token_id: 2
+    bos_token_id: 0
+    eos_token_id: 0
     hidden_act: silu
-    hidden_size: 16
+    hidden_size: 2048
     initializer_range: 0.02
-    intermediate_size: 64
+    intermediate_size: 8192
     is_llama_config: true
-    max_position_embeddings: 256
-    num_attention_heads: 4
-    num_hidden_layers: 2
-    num_key_value_heads: 4
+    max_position_embeddings: 2048
+    num_attention_heads: 32
+    num_hidden_layers: 24
+    num_key_value_heads: 32
     pad_token_id: null
     pretraining_tp: 1
     rms_norm_eps: 1.0e-05
     rope_scaling: null
     tie_word_embeddings: true
     use_cache: true
-    vocab_size: 256
+    vocab_size: 49152
 optimizer:
   accumulate_grad_in_fp32: true
   clip_grad: 1.0
@@ -87,14 +70,15 @@ optimizer:
   weight_decay: 0.01
   zero_stage: 0
 parallelism:
-  dp: 2
+  dp: 8
   expert_parallel_size: 1
-  pp: 2
+  pp: 1
   pp_engine: 1f1b
-  tp: 2
+  tp: 1
   tp_linear_async_communication: true
   tp_mode: REDUCE_SCATTER
-profiler: null
+profiler:
+  profiler_export_path: ./tb_logs
 tokenizer:
   tokenizer_max_length: null
   tokenizer_name_or_path: robot-test/dummy-tokenizer-wordlevel
@@ -103,7 +87,7 @@ tokens:
   batch_accumulation_per_replica: 1
   limit_test_batches: 0
   limit_val_batches: 0
-  micro_batch_size: 2
-  sequence_length: 256
-  train_steps: 15
-  val_check_interval: -1
+  micro_batch_size: 4
+  sequence_length: 2048
+  train_steps: 7
+  val_check_interval: 100
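
Because the Hugging Face dataset block is replaced by `dataset: null # Custom dataloader will be used`, the training script is expected to supply its own dataloader. The sketch below is a minimal, hypothetical illustration of such a dataloader in plain PyTorch; the `get_dataloader` name, the random-token dataset, and the `input_ids`/`label_ids` batch layout are assumptions for illustration, not nanotron's confirmed interface. The constants mirror the values set above (`sequence_length: 2048`, `micro_batch_size: 4`, `vocab_size: 49152`).

```python
# Hypothetical sketch of the custom dataloader that `dataset: null` defers to.
# Function and field names are assumptions, not nanotron's actual API.
import torch
from torch.utils.data import DataLoader, Dataset

SEQUENCE_LENGTH = 2048   # tokens.sequence_length in the config above
MICRO_BATCH_SIZE = 4     # tokens.micro_batch_size
VOCAB_SIZE = 49152       # model_config.vocab_size


class RandomTokenDataset(Dataset):
    """Emits random token ids; a stand-in for a real tokenized corpus."""

    def __init__(self, num_samples: int = 10_000):
        self.num_samples = num_samples

    def __len__(self) -> int:
        return self.num_samples

    def __getitem__(self, idx: int) -> dict:
        # Sample sequence_length + 1 tokens, then shift by one for labels.
        tokens = torch.randint(0, VOCAB_SIZE, (SEQUENCE_LENGTH + 1,))
        return {"input_ids": tokens[:-1], "label_ids": tokens[1:]}


def get_dataloader(num_loading_workers: int = 1, seed: int = 42) -> DataLoader:
    # Defaults mirror data_stages[0].data in the config above.
    generator = torch.Generator().manual_seed(seed)
    return DataLoader(
        RandomTokenDataset(),
        batch_size=MICRO_BATCH_SIZE,
        num_workers=num_loading_workers,
        generator=generator,
        drop_last=True,
    )
```

With `dp: 8`, `tp: 1`, `pp: 1` and `batch_accumulation_per_replica: 1`, each of the 8 data-parallel ranks would pull micro-batches of 4 sequences from a dataloader like this, giving a global batch of 32 sequences of 2048 tokens per step.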