Commit 8aa249e by NouamaneTazi, Nov 25, 2024 (1 parent: f6a7db3)
Showing 1 changed file with 21 additions and 37 deletions.

examples/config_tiny_llama.yaml (21 additions, 37 deletions)
@@ -1,3 +1,4 @@
+# /fsx/nouamane/miniconda/envs/2-1-cu121/bin/torchrun --nproc_per_node=8 run_train.py --config-file examples/config_tiny_llama.yaml
 checkpoints:
   checkpoint_interval: 10
   checkpoints_path: checkpoints
@@ -6,29 +7,11 @@ checkpoints:
   save_initial_state: false
 data_stages:
 - data:
-    dataset:
-      dataset_overwrite_cache: false
-      dataset_processing_num_proc_per_process: 1
-      hf_dataset_config_name: null
-      hf_dataset_or_datasets: stas/openwebtext-10k
-      hf_dataset_splits: train
-      text_column_name: text
+    dataset: null # Custom dataloader will be used
     num_loading_workers: 1
     seed: 42
   name: Stable Training Stage
   start_training_step: 1
-- data:
-    dataset:
-      dataset_overwrite_cache: false
-      dataset_processing_num_proc_per_process: 1
-      hf_dataset_config_name: null
-      hf_dataset_or_datasets: stas/openwebtext-10k
-      hf_dataset_splits: train
-      text_column_name: text
-    num_loading_workers: 1
-    seed: 42
-  name: Annealing Phase
-  start_training_step: 10
 general:
   benchmark_csv_path: null
   consumed_train_samples: null
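This hunk drops the second "Annealing Phase" data stage entirely and replaces the Hugging Face dataset config with `dataset: null`, deferring to a custom dataloader in the training script. As a rough illustration only (not nanotron's actual dataloader API), any stand-in just needs to yield token batches shaped (micro_batch_size, sequence_length); the sketch below uses a hypothetical random-token dataset with values taken from this config:

# Hypothetical stand-in for the "custom dataloader" the diff comment refers to;
# nanotron's real hook differs, this only demonstrates the expected batch shape.
import torch
from torch.utils.data import DataLoader, Dataset

class RandomTokenDataset(Dataset):
    """Yields random token ids matching the config (vocab 49152, seq_len 2048)."""
    def __init__(self, vocab_size=49152, seq_len=2048, n_samples=1024):
        self.vocab_size, self.seq_len, self.n_samples = vocab_size, seq_len, n_samples

    def __len__(self):
        return self.n_samples

    def __getitem__(self, idx):
        tokens = torch.randint(0, self.vocab_size, (self.seq_len,))
        return {"input_ids": tokens, "labels": tokens.clone()}

# micro_batch_size=4 and num_loading_workers=1 come from the config below.
loader = DataLoader(RandomTokenDataset(), batch_size=4, num_workers=1)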
@@ -46,27 +29,27 @@ model:
   ddp_bucket_cap_mb: 25
   dtype: bfloat16
   init_method:
-    std: 0.025
+    std: 0.02
   make_vocab_size_divisible_by: 1
   model_config:
-    bos_token_id: 1
-    eos_token_id: 2
+    bos_token_id: 0
+    eos_token_id: 0
     hidden_act: silu
-    hidden_size: 16
+    hidden_size: 2048
     initializer_range: 0.02
-    intermediate_size: 64
+    intermediate_size: 8192
     is_llama_config: true
-    max_position_embeddings: 256
-    num_attention_heads: 4
-    num_hidden_layers: 2
-    num_key_value_heads: 4
+    max_position_embeddings: 2048
+    num_attention_heads: 32
+    num_hidden_layers: 24
+    num_key_value_heads: 32
     pad_token_id: null
     pretraining_tp: 1
     rms_norm_eps: 1.0e-05
     rope_scaling: null
     tie_word_embeddings: true
     use_cache: true
-    vocab_size: 256
+    vocab_size: 49152
 optimizer:
   accumulate_grad_in_fp32: true
   clip_grad: 1.0
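The model hunk turns the tiny debug config into a roughly 1.7B-parameter Llama-style model. A back-of-the-envelope count from the new values (my arithmetic, not stated in the commit; assumes tied embeddings and a SwiGLU MLP, which the config's tie_word_embeddings and silu settings suggest):

# Rough parameter count for the new model_config.
h, n_layers, inter, vocab = 2048, 24, 8192, 49152
attn = 4 * h * h            # q, k, v, o projections (num_key_value_heads == num_attention_heads)
mlp = 3 * h * inter         # gate, up, down projections for silu/SwiGLU
norms = 2 * h               # two RMSNorms per layer
per_layer = attn + mlp + norms
total = n_layers * per_layer + vocab * h + h  # + tied embedding matrix + final norm
print(f"{total / 1e9:.2f}B parameters")       # ~1.71B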
@@ -87,14 +70,15 @@ optimizer:
   weight_decay: 0.01
   zero_stage: 0
 parallelism:
-  dp: 2
+  dp: 8
   expert_parallel_size: 1
-  pp: 2
+  pp: 1
   pp_engine: 1f1b
-  tp: 2
+  tp: 1
   tp_linear_async_communication: true
   tp_mode: REDUCE_SCATTER
-profiler: null
+profiler:
+  profiler_export_path: ./tb_logs
 tokenizer:
   tokenizer_max_length: null
   tokenizer_name_or_path: robot-test/dummy-tokenizer-wordlevel
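The parallelism hunk swaps the 2x2x2 layout for pure data parallelism, and also enables the profiler with a TensorBoard export path. The product dp * pp * tp must still equal the number of launched processes, which matches the --nproc_per_node=8 in the torchrun command added at the top of the file; a trivial sanity check (illustrative only):

# World size launched by torchrun must equal dp * pp * tp.
dp, pp, tp = 8, 1, 1        # new values in this commit
assert dp * pp * tp == 8    # --nproc_per_node=8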
@@ -103,7 +87,7 @@ tokens:
   batch_accumulation_per_replica: 1
   limit_test_batches: 0
   limit_val_batches: 0
-  micro_batch_size: 2
-  sequence_length: 256
-  train_steps: 15
-  val_check_interval: -1
+  micro_batch_size: 4
+  sequence_length: 2048
+  train_steps: 7
+  val_check_interval: 100
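With the new tokens section, each optimizer step consumes micro_batch_size * batch_accumulation_per_replica * dp sequences. Working that through (a back-of-the-envelope check, assuming the usual global-batch formula; the commit itself does not state these numbers):

# Tokens consumed per training step under the new config.
micro_batch_size, grad_accum, dp, seq_len = 4, 1, 8, 2048
global_batch = micro_batch_size * grad_accum * dp   # 32 sequences per step
tokens_per_step = global_batch * seq_len            # 65,536 tokens
print(tokens_per_step * 7)                          # 458,752 tokens over train_steps=7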
