Commit: add updated config
ahmeda14960 committed Jan 26, 2025
1 parent 0345857 · commit 97d35d5
Showing 1 changed file with 41 additions and 0 deletions.
config/debug_sft.yaml: 41 additions, 0 deletions
@@ -0,0 +1,41 @@
dataset_type: chat_jsonl
chat_train_urls:
  - "gs://marin-us-central2/documents/allenai--tulu-v2-sft-mixture-0ba27c/data/**/*.jsonl.gz"
supervised_data:
  # cache_dir before trying sequence packing
  cache_dir: "gs://marin-us-central2/tokenized/tulu_sft_v3_llama3_tokenizer-7b19dc"
  #cache_dir: "gs://marin-us-central2/tokenized/tulu_sft_v3_llama3_tokenizer_retrypack-bca8bd/"

tokenizer: "meta-llama/Meta-Llama-3.1-8B"
model:  # 7B class model
  type: llama
  seq_len: 2048
  hidden_dim: 4096
  intermediate_dim: 11008
  num_layers: 32
  num_heads: 32
  num_kv_heads: 32
  use_flash_attention: True
  flash_attention_block_size: 512
  use_bias: false
  use_layer_norm_weight: false
trainer:
  tracker:
    type: wandb
    project: "marin"
    tags: ["dolma", "olmo", "llama"]

  mp: p=f32,c=bfloat16
  train_batch_size: 256
  num_train_steps: 750000  # 3,000,000,000,000 / 4,000,000 = 750,000
  steps_per_eval: 1000
  tensor_parallel_axes: ["mlp", "heads"]
  fsdp_axis: "embed"
  batch_axis: "batch"
optimizer:
  learning_rate: 4E-4
  weight_decay: 0.1
  min_lr_ratio: 0.1
  warmup: 5000

epoch: 0
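
A quick way to sanity-check this config before launching a run is to load it and recompute the token budget. The sketch below is illustrative only and not part of the commit: it uses plain PyYAML rather than the actual marin/levanter config loader, whose typed schema is not shown here. One thing it surfaces: the inline comment on num_train_steps assumes roughly 4M tokens per step (3e12 / 4e6 = 750,000), while this config's train_batch_size * seq_len is 256 * 2048 = 524,288 tokens per step, so the comment's arithmetic may carry over from a different batch setting.

import yaml

# Load the committed config with plain PyYAML (assumption: the real
# training entrypoint uses its own config loader and schema).
with open("config/debug_sft.yaml") as f:
    cfg = yaml.safe_load(f)

# Tokens consumed per optimizer step = global batch size * sequence length.
tokens_per_step = cfg["trainer"]["train_batch_size"] * cfg["model"]["seq_len"]
total_tokens = tokens_per_step * cfg["trainer"]["num_train_steps"]

print(f"tokens per step: {tokens_per_step:,}")  # 256 * 2048 = 524,288
print(f"total tokens:    {total_tokens:,}")     # ~3.93e11 over 750,000 steps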
