Commit 4600165

Co-authored-by: Oğuzhan Fatih Kar <[email protected]>
Co-authored-by: David Mizrahi <[email protected]>
Co-authored-by: Ali Garjani <[email protected]>
1 parent: 43558d1
Showing 61 changed files with 15,348 additions and 8,822 deletions.
cfgs/default/4m/alphas_mixture/main/mix_mod21_all2allmix_rgb2all_capT5bias.yaml (new file, 72 additions)
# Mixture of alphas:
# - all2all with input and target alphas 0.01, 0.1, 1.0, 10.0
# - rgb2all with target alpha 0.5
# - caption and T5 embedding bias (each weighted half)

sampling_weights: [1.0, 1.0, 1.0, 1.0, 1.0, 0.5, 0.5]

alphas_mixture:
  rgb@224:
    input_alphas: [0.01, 0.1, 1.0, 10.0, 1000.0, 0.05, 0.05]
    target_alphas: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
  caption:
    input_alphas: [0.01, 0.1, 1.0, 10.0, 0.0, 5.0, 0.0]
    target_alphas: [0.01, 0.1, 1.0, 10.0, 0.5, 0.0, 0.5]
    keep: ['random', 'random', 'random', 'random', 'random', 'all', 'random']
  t5_caption:
    input_alphas: [0.01, 0.1, 1.0, 10.0, 0.0, 0.0, 5.0]
    target_alphas: [0.01, 0.1, 1.0, 10.0, 0.5, 0.5, 0.0]
    keep: ['random', 'random', 'random', 'random', 'random', 'random', 'all']
  det:
    input_alphas: [0.01, 0.1, 1.0, 10.0, 0.0, 0.05, 0.05]
    target_alphas: [0.01, 0.1, 1.0, 10.0, 0.5, 0.5, 0.5]
    keep: ['random', 'random', 'random', 'random', 'random', 'random', 'random']
  tok_rgb@224:
    input_alphas: [0.01, 0.1, 1.0, 10.0, 0.0, 0.05, 0.05]
    target_alphas: [0.01, 0.1, 1.0, 10.0, 0.5, 0.5, 0.5]
  tok_normal@224:
    input_alphas: [0.01, 0.1, 1.0, 10.0, 0.0, 0.05, 0.05]
    target_alphas: [0.01, 0.1, 1.0, 10.0, 0.5, 0.5, 0.5]
  tok_depth@224:
    input_alphas: [0.01, 0.1, 1.0, 10.0, 0.0, 0.05, 0.05]
    target_alphas: [0.01, 0.1, 1.0, 10.0, 0.5, 0.5, 0.5]
  tok_semseg@224:
    input_alphas: [0.01, 0.1, 1.0, 10.0, 0.0, 0.05, 0.05]
    target_alphas: [0.01, 0.1, 1.0, 10.0, 0.5, 0.5, 0.5]
  tok_clip@224:
    input_alphas: [0.01, 0.1, 1.0, 10.0, 0.0, 0.05, 0.05]
    target_alphas: [0.01, 0.1, 1.0, 10.0, 0.5, 0.5, 0.5]
  human_poses:
    input_alphas: [0.01, 0.1, 1.0, 10.0, 0.0, 0.05, 0.05]
    target_alphas: [0.01, 0.1, 1.0, 10.0, 0.5, 0.5, 0.5]
    keep: ['random', 'random', 'random', 'random', 'random', 'random', 'random']
  tok_dinov2@224:
    input_alphas: [0.01, 0.1, 1.0, 10.0, 0.0, 0.05, 0.05]
    target_alphas: [0.01, 0.1, 1.0, 10.0, 0.5, 0.5, 0.5]
  tok_dinov2_global:
    input_alphas: [0.01, 0.1, 1.0, 10.0, 0.0, 0.05, 0.05]
    target_alphas: [0.01, 0.1, 1.0, 10.0, 0.5, 0.5, 0.5]
  tok_imagebind@224:
    input_alphas: [0.01, 0.1, 1.0, 10.0, 0.0, 0.05, 0.05]
    target_alphas: [0.01, 0.1, 1.0, 10.0, 0.5, 0.5, 0.5]
  tok_imagebind_global:
    input_alphas: [0.01, 0.1, 1.0, 10.0, 0.0, 0.05, 0.05]
    target_alphas: [0.01, 0.1, 1.0, 10.0, 0.5, 0.5, 0.5]
  tok_canny_edge@224:
    input_alphas: [0.01, 0.1, 1.0, 10.0, 0.0, 0.05, 0.05]
    target_alphas: [0.01, 0.1, 1.0, 10.0, 0.5, 0.5, 0.5]
  tok_sam_edge@224:
    input_alphas: [0.01, 0.1, 1.0, 10.0, 0.0, 0.05, 0.05]
    target_alphas: [0.01, 0.1, 1.0, 10.0, 0.5, 0.5, 0.5]
  color_palette:
    input_alphas: [0.01, 0.1, 1.0, 10.0, 0.0, 0.05, 0.05]
    target_alphas: [0.01, 0.1, 1.0, 10.0, 0.5, 0.5, 0.5]
    keep: ['binary', 'binary', 'binary', 'binary', 'binary', 'binary', 'binary']
  metadata:
    input_alphas: [0.01, 0.1, 1.0, 10.0, 0.0, 0.05, 0.05]
    target_alphas: [0.01, 0.1, 1.0, 10.0, 0.5, 0.5, 0.5]
    keep: ['random', 'random', 'random', 'random', 'random', 'random', 'random']
  sam_instance:
    input_alphas: [0.01, 0.1, 1.0, 10.0, 0.0, 0.05, 0.05]
    target_alphas: [0.01, 0.1, 1.0, 10.0, 0.5, 0.5, 0.5]
    keep: ['random', 'random', 'random', 'random', 'random', 'random', 'random']
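Each position across the alpha lists above is one mixture component: a component is first drawn in proportion to sampling_weights, and the chosen column then supplies per-modality Dirichlet concentration parameters that split the input and target token budgets. The sketch below illustrates that scheme; it is a minimal, hypothetical reading of the config (the function name and the exact budget-splitting logic are assumptions, not the repository's actual API):

import numpy as np
import yaml

def sample_modality_shares(cfg_path, rng=None):
    """Pick one mixture component, then draw per-modality input-budget shares."""
    rng = rng or np.random.default_rng()
    with open(cfg_path) as f:
        cfg = yaml.safe_load(f)
    weights = np.asarray(cfg['sampling_weights'], dtype=float)
    # Select a mixture component (one "column" of the alpha lists) by weight.
    idx = rng.choice(len(weights), p=weights / weights.sum())
    mods = cfg['alphas_mixture']
    alphas = np.array([m['input_alphas'][idx] for m in mods.values()])
    # Dirichlet draw over the modalities with nonzero alpha; an alpha of 0
    # excludes that modality from the input for this component.
    shares = np.zeros_like(alphas)
    active = alphas > 0
    shares[active] = rng.dirichlet(alphas[active])
    return dict(zip(mods, shares))

In the rgb2all column, for instance, rgb@224 is the only modality with a nonzero input alpha (1000.0), so the entire input budget falls on RGB while every other modality keeps a nonzero target alpha.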
cfgs/default/4m/alphas_mixture/main/mix_mod7_all2allmix_rgb2all_capbias.yaml (new file, 34 additions)
# Mixture of alphas:
# - all2all with input and target alphas 0.01, 0.1, 1.0, 10.0
# - rgb2all with target alpha 0.5
# - caption bias

sampling_weights: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]

alphas_mixture:
  rgb@224:
    input_alphas: [0.01, 0.1, 1.0, 10.0, 1000.0, 0.05]
    target_alphas: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
  caption:
    input_alphas: [0.01, 0.1, 1.0, 10.0, 0.0, 5.0]
    target_alphas: [0.01, 0.1, 1.0, 10.0, 0.5, 0.5]
    keep: ['random', 'random', 'random', 'random', 'random', 'all']
  det:
    input_alphas: [0.01, 0.1, 1.0, 10.0, 0.0, 0.05]
    target_alphas: [0.01, 0.1, 1.0, 10.0, 0.5, 0.5]
    keep: ['random', 'random', 'random', 'random', 'random', 'random']
  tok_rgb@224:
    input_alphas: [0.01, 0.1, 1.0, 10.0, 0.0, 0.05]
    target_alphas: [0.01, 0.1, 1.0, 10.0, 0.5, 0.5]
  tok_normal@224:
    input_alphas: [0.01, 0.1, 1.0, 10.0, 0.0, 0.05]
    target_alphas: [0.01, 0.1, 1.0, 10.0, 0.5, 0.5]
  tok_depth@224:
    input_alphas: [0.01, 0.1, 1.0, 10.0, 0.0, 0.05]
    target_alphas: [0.01, 0.1, 1.0, 10.0, 0.5, 0.5]
  tok_semseg@224:
    input_alphas: [0.01, 0.1, 1.0, 10.0, 0.0, 0.05]
    target_alphas: [0.01, 0.1, 1.0, 10.0, 0.5, 0.5]
  tok_clip@224:
    input_alphas: [0.01, 0.1, 1.0, 10.0, 0.0, 0.05]
    target_alphas: [0.01, 0.1, 1.0, 10.0, 0.5, 0.5]
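Both mixture files share the same invariant: every input_alphas, target_alphas, and keep list must have exactly as many entries as sampling_weights (seven in the first file, six here), since each position is one mixture component. A small sanity check along these lines, using PyYAML (check_alphas_config is a hypothetical helper, not part of the repo):

import yaml

def check_alphas_config(path):
    """Verify each per-modality list matches the number of mixture components."""
    with open(path) as f:
        cfg = yaml.safe_load(f)
    n = len(cfg['sampling_weights'])
    for name, spec in cfg['alphas_mixture'].items():
        for key in ('input_alphas', 'target_alphas', 'keep'):
            if key in spec and len(spec[key]) != n:
                raise ValueError(f'{name}.{key}: {len(spec[key])} entries, expected {n}')

check_alphas_config('cfgs/default/4m/alphas_mixture/main/mix_mod7_all2allmix_rgb2all_capbias.yaml')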
cfgs/default/4m/data/cc12m+coyo+c4/main/mix_mod21_all2allmix_rgb2all_capT5bias_C4.yaml (new file, 105 additions)
train:
  datasets:
    cc12m:
      type: multimodal

      # Input and output domain names, separated by hyphens
      in_domains: caption-t5_caption-det-metadata-rgb@224-tok_rgb@224-tok_normal@224-tok_depth@224-tok_semseg@224-tok_clip@224-human_poses-tok_dinov2@224-tok_dinov2_global-tok_imagebind@224-tok_imagebind_global-tok_sam_edge@224-tok_canny_edge@224-color_palette-sam_instance
      out_domains: caption-det-metadata-tok_rgb@224-tok_normal@224-tok_depth@224-tok_semseg@224-tok_clip@224-human_poses-tok_dinov2@224-tok_dinov2_global-tok_imagebind@224-tok_imagebind_global-tok_sam_edge@224-tok_canny_edge@224-color_palette-sam_instance

      # Dirichlet alpha concentration parameters for input and output.
      # Can be either one value, or one value per input modality separated by hyphens.
      input_alphas: null
      target_alphas: null
      # Path to a specific alphas configuration to enable a mixture of Dirichlets.
      # If provided, overrides input_alphas and target_alphas.
      alphas_config: "cfgs/default/4m/alphas_mixture/main/mix_mod21_all2allmix_rgb2all_capT5bias.yaml"

      # Optionally, min_input_tokens, min_target_tokens, num_input_tokens, and num_target_tokens can be specified here.
      # If so, they override the values provided in the main config.
      min_input_tokens: null
      min_target_tokens: null
      num_input_tokens: 256
      num_target_tokens: 256

      # Data can be either local or on cloud storage (e.g. S3); see the data docs for more info.
      # Use braceexpand notation to indicate a shard range (e.g. shard-{0000..9999}.tar).
      # Use brackets to indicate multiple modalities (e.g. [modality1,modality2,modality3]).
      data_path: 'path/to/training/data/[modality1,modality2,modality3]/shard-{00000..9999}.tar'
      use_wds: True # Use webdataset
      wds_n_repeats: 4 # Number of repeats for the webdataset loader to improve efficiency
      wds_shuffle_buffer_tar: 1_000 # Webdataset shuffle buffer after loading tar files
      wds_shuffle_buffer_repeat: 1_000 # Webdataset shuffle buffer after repeating samples

      main_augment_domain: rgb@224 # Modality from which to get the original full image size (mostly important for resizing bounding boxes)
      aligned_captions: True # Align captions to crop_settings
      tok_train_aug: True # Apply data augmentation to tokens (if multiple crop settings are available)

      # modality_name_map: # Use modality_name_map to define a mapping from a folder name to a modality name
      #   tok_rgb_folder_name: tok_rgb@224
      #   tok_depth_folder_name: tok_depth@224
      #   ...

    coyo700m:
      type: multimodal

      # Input and output domain names, separated by hyphens
      in_domains: caption-det-rgb@224-tok_rgb@224-tok_normal@224-tok_depth@224-tok_semseg@224-tok_clip@224
      out_domains: caption-det-tok_rgb@224-tok_normal@224-tok_depth@224-tok_semseg@224-tok_clip@224

      # Dirichlet alpha concentration parameters for input and output.
      # Can be either one value, or one value per input modality separated by hyphens.
      input_alphas: null
      target_alphas: null
      # Path to a specific alphas configuration to enable a mixture of Dirichlets.
      # If provided, overrides input_alphas and target_alphas.
      alphas_config: "cfgs/bolt/pretrain/4m/alphas_mixture/all2allmix-oldmod_rgb2all_capbias_v0.yaml" # TODO

      # Optionally, min_input_tokens, min_target_tokens, num_input_tokens, and num_target_tokens can be specified here.
      # If so, they override the values provided in the main config.
      min_input_tokens: null
      min_target_tokens: null
      num_input_tokens: 256
      num_target_tokens: 256

      # Data can be either local or on cloud storage (e.g. S3); see the data docs for more info.
      # Use braceexpand notation to indicate a shard range (e.g. shard-{0000..9999}.tar).
      # Use brackets to indicate multiple modalities (e.g. [modality1,modality2,modality3]).
      data_path: 'path/to/training/data/[modality1,modality2,modality3]/shard-{00000..9999}.tar'
      use_wds: True # Use webdataset
      wds_n_repeats: 1 # Number of repeats for the webdataset loader to improve efficiency
      wds_shuffle_buffer_tar: 1_000 # Webdataset shuffle buffer after loading tar files
      wds_shuffle_buffer_repeat: 1_000 # Webdataset shuffle buffer after repeating samples

      main_augment_domain: rgb@224 # Modality from which to get the original full image size (mostly important for resizing bounding boxes)
      aligned_captions: True # Align captions to crop_settings
      tok_train_aug: True # Apply data augmentation to tokens (if multiple crop settings are available)

      # modality_name_map: # Use modality_name_map to define a mapping from a folder name to a modality name
      #   tok_rgb_folder_name: tok_rgb@224
      #   tok_depth_folder_name: tok_depth@224
      #   ...

    c4:
      type: huggingface

      in_domains: caption
      out_domains: caption

      input_alphas: "1.0"
      target_alphas: "1.0"
      alphas_config: null

      data_path: '/path/to/c4/en'
      shuffle_buffer_load: 1_000

  weights: [0.6, 0.2, 0.2] # Sampling weights for the training datasets

val:
  datasets:
    cc12m:
      data_path: 'path/to/val/data'
    coyo700m:
      data_path: 'path/to/val/data'
    c4:
      data_path: 'path/to/val/data'
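The data_path notation above combines two expansions: a braceexpand range for shards and a bracketed list for modalities. Below is a hedged sketch of how such a path could be expanded into concrete per-modality shard lists; the bracket convention is this repo's, while expand_data_path is an illustrative helper, not the repository's loader:

from braceexpand import braceexpand  # pip install braceexpand

def expand_data_path(path, modalities):
    """Expand '[mod1,mod2,...]' and 'shard-{a..b}.tar' into per-modality shard lists."""
    placeholder = '[' + ','.join(modalities) + ']'
    # Substitute the bracketed placeholder with each concrete modality name,
    # then let braceexpand enumerate the zero-padded shard range.
    return {mod: list(braceexpand(path.replace(placeholder, mod)))
            for mod in modalities}

shards = expand_data_path(
    'path/to/training/data/[modality1,modality2,modality3]/shard-{00000..09999}.tar',
    ['modality1', 'modality2', 'modality3'],
)
print(len(shards['modality1']))  # 10000 shard paths per modality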
New training config file (49 additions):
# Config for DDP

# Arch: SwiGLU, no bias
# Modalities: mix of rgb2all, all2all, caption/T5-biased2all, and C4 text-only
# Datasets: mix of COYO700M, CC12M, and C4
# To be run on 64 GPUs for a total batch size of 4096
run_name: auto

# Input & output
num_input_tokens: 256
num_target_tokens: 256
loss_type: mod

# Architecture
model: fm_base_12e_12d_swiglu_nobias
patch_size: 16
input_size: 224
dtype: bfloat16
tokenizer_path: "fourm/utils/tokenizer/trained/text_tokenizer_4m_wordpiece_30k.json"

# Initialization
finetune: '/path/to/4M_checkpoint.pth' # Change me. Initialize 4M-21 training from a 4M-7 checkpoint

# Train
epochs: -1
total_tokens: 500 # in billions
opt: adamw
blr: 0.0001 # base_lr = 1e-4; lr = base_lr * batch_size / 256
min_blr: 0.
warmup_epochs: -1
warmup_tokens: 10 # in billions
batch_size: 64 # per GPU; 64 GPUs x 64 = 4096

# Data
data_config: "cfgs/default/4m/data/cc12m+coyo+c4/main/mix_mod21_all2allmix_rgb2all_capT5bias_C4.yaml"
s3_data_endpoint: "/path/to/endpoint" # Change me
eval_freq: 1
fixed_eval: True
epoch_size: 10_000_000 # Number of samples per "epoch"

# Saving
save_ckpt_freq: 1
output_dir: 'output/auto'

# Wandb
log_wandb: False # Set to True to log to Weights & Biases
wandb_project: '4m-train'
wandb_entity: null # Change if needed
wandb_run_name: auto
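Two of the comments above can be made concrete with quick arithmetic. The sketch below works out the effective learning rate and the schedule lengths; the assumption that each sample counts num_input_tokens + num_target_tokens = 512 tokens toward the budget is mine, not stated in the config:

# Linear lr scaling rule from the blr comment.
num_gpus = 64
per_gpu_batch_size = 64
total_batch_size = num_gpus * per_gpu_batch_size  # 4096
blr = 1e-4
lr = blr * total_batch_size / 256                 # 1.6e-3

# Token-based schedule lengths, assuming 512 tokens counted per sample.
tokens_per_step = total_batch_size * 512          # ~2.1M tokens per optimizer step
total_steps = 500e9 / tokens_per_step             # ~238k steps for total_tokens: 500
warmup_steps = 10e9 / tokens_per_step             # ~4.8k steps for warmup_tokens: 10
print(f'lr={lr:.1e}, total_steps={total_steps:,.0f}, warmup_steps={warmup_steps:,.0f}')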