Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Actually do sequence packing #873

Merged
merged 18 commits into from
Feb 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
200 changes: 200 additions & 0 deletions config/data/dolma_llama.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
# Data mixture configuration: Dolma v1.7 sources tokenized with a Llama 3.1
# tokenizer (see `tokenizer` below). Each entry under `configs` names one
# source, the raw shards it is read from, and where its token cache lives;
# `train_weights` assigns a weight to every source by the same key.

# No mixture-wide cache directory; every source sets its own `cache_dir`.
cache_dir: null

# Settings for building the token caches.
cache_options:
  batch_size: 128
  num_shard_groups: 128
  target_size_per_flush: 512MB

# One mapping per source. Keys here must match the keys in `train_weights`.
# NOTE(review): raw shards are read from a us-central2 bucket while caches are
# written to a us-west4 bucket — confirm the cross-region layout is intended.
configs:
  dolma/algebraic-stack:
    cache_dir: gs://marin-us-west4/tokenized/dolma/algebraic-stack-cc00cf
    id: null
    name: null
    plaintext: false
    stream: true
    tags: []
    text_key: text
    train_urls:
      - gs://marin-us-central2/raw/dolma/v1.7/algebraic-stack-train-{0000..0015}.json.gz
    validation_urls: []

  dolma/arxiv:
    cache_dir: gs://marin-us-west4/tokenized/dolma/arxiv-07a51f
    id: null
    name: null
    plaintext: false
    stream: true
    tags: []
    text_key: text
    train_urls:
      - gs://marin-us-central2/raw/dolma/v1.7/arxiv-{0000..0099}.json.gz
    validation_urls: []

  dolma/c4:
    cache_dir: gs://marin-us-west4/tokenized/dolma/c4-e0e5ec
    id: null
    name: null
    plaintext: false
    stream: true
    tags: []
    text_key: text
    train_urls:
      - gs://marin-us-central2/raw/dolma/v1.7/c4-{0000..0170}.json.gz
    validation_urls: []

  dolma/cc:
    cache_dir: gs://marin-us-west4/tokenized/dolma/cc-74b017
    id: null
    name: null
    plaintext: false
    stream: true
    tags: []
    text_key: text
    # NOTE(review): the ranges below skip cc_en_middle-0239 and
    # cc_en_tail-0153 — presumably those shards are missing or unusable;
    # confirm before "simplifying" these into single ranges.
    train_urls:
      - gs://marin-us-central2/raw/dolma/v1.7/cc_en_head-{0000..0274}.json.gz
      - gs://marin-us-central2/raw/dolma/v1.7/cc_en_middle-{0000..0238}.json.gz
      - gs://marin-us-central2/raw/dolma/v1.7/cc_en_middle-{0240..0379}.json.gz
      - gs://marin-us-central2/raw/dolma/v1.7/cc_en_tail-{0000..0152}.json.gz
      - gs://marin-us-central2/raw/dolma/v1.7/cc_en_tail-{0154..0444}.json.gz
    validation_urls: []

  dolma/cc-news:
    cache_dir: gs://marin-us-west4/tokenized/dolma/cc-news-625d3e
    id: null
    name: null
    plaintext: false
    stream: true
    tags: []
    text_key: text
    train_urls:
      - gs://marin-us-central2/raw/dolma/v1.7/cc_news_head-{0000..0004}.json.gz
      - gs://marin-us-central2/raw/dolma/v1.7/cc_news_middle-{0000..0002}.json.gz
      - gs://marin-us-central2/raw/dolma/v1.7/cc_news_tail-0000.json.gz
    validation_urls: []

  dolma/falcon:
    cache_dir: gs://marin-us-west4/tokenized/dolma/falcon-da8fd0
    id: null
    name: null
    plaintext: false
    stream: true
    tags: []
    text_key: text
    train_urls:
      - gs://marin-us-central2/raw/dolma/v1.7/falcon-{0000..0499}.json.gz
    validation_urls: []

  dolma/flan:
    cache_dir: gs://marin-us-west4/tokenized/dolma/flan-a99cb2
    id: null
    name: null
    plaintext: false
    stream: true
    tags: []
    text_key: text
    train_urls:
      - gs://marin-us-central2/raw/dolma/v1.7/tulu_flan-{0000..0065}.json.gz
    validation_urls: []

  # The gutenberg source is read from the `books-*` raw shards.
  dolma/gutenberg:
    cache_dir: gs://marin-us-west4/tokenized/dolma/gutenberg-f9eb99
    id: null
    name: null
    plaintext: false
    stream: true
    tags: []
    text_key: text
    train_urls:
      - gs://marin-us-central2/raw/dolma/v1.7/books-{0000..0002}.json.gz
    validation_urls: []

  dolma/megawika:
    cache_dir: gs://marin-us-west4/tokenized/dolma/megawika-34abf2
    id: null
    name: null
    plaintext: false
    stream: true
    tags: []
    text_key: text
    train_urls:
      - gs://marin-us-central2/raw/dolma/v1.7/megawika-{0000..0261}.json.gz
    validation_urls: []

  dolma/open-web-math:
    cache_dir: gs://marin-us-west4/tokenized/dolma/open-web-math-79823d
    id: null
    name: null
    plaintext: false
    stream: true
    tags: []
    text_key: text
    train_urls:
      - gs://marin-us-central2/raw/dolma/v1.7/open-web-math-train-{0000..0012}.json.gz
    validation_urls: []

  dolma/pes2o:
    cache_dir: gs://marin-us-west4/tokenized/dolma/pes2o-538363
    id: null
    name: null
    plaintext: false
    stream: true
    tags: []
    text_key: text
    train_urls:
      - gs://marin-us-central2/raw/dolma/v1.7/pes2o-{0000..0025}.json.gz
    validation_urls: []

  dolma/reddit:
    cache_dir: gs://marin-us-west4/tokenized/dolma/reddit-62a64a
    id: null
    name: null
    plaintext: false
    stream: true
    tags: []
    text_key: text
    train_urls:
      - gs://marin-us-central2/raw/dolma/v1.7/reddit-{0000..0077}.json.gz
    validation_urls: []

  dolma/stackexchange:
    cache_dir: gs://marin-us-west4/tokenized/dolma/stackexchange-adfc49
    id: null
    name: null
    plaintext: false
    stream: true
    tags: []
    text_key: text
    train_urls:
      - gs://marin-us-central2/raw/dolma/v1.7/stackexchange-{0000..0025}.json.gz
    validation_urls: []

  dolma/starcoder:
    cache_dir: gs://marin-us-west4/tokenized/dolma/starcoder-8b6089
    id: null
    name: null
    plaintext: false
    stream: true
    tags: []
    text_key: text
    train_urls:
      - gs://marin-us-central2/raw/dolma/v1.7/starcoder-{0000..0048}.json.gz
    validation_urls: []

  dolma/wiki:
    cache_dir: gs://marin-us-west4/tokenized/dolma/wiki-212315
    id: null
    name: null
    plaintext: false
    stream: true
    tags: []
    text_key: text
    train_urls:
      - gs://marin-us-central2/raw/dolma/v1.7/wiki-{0000..0001}.json.gz
    validation_urls: []

# Mixture-wide settings.
enforce_eos: true
ignore_token_id: null
mixture_block_size: 2048
shuffle: true
stop_strategy: restart
tokenizer: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF

# Per-source weights, keyed by the names used under `configs`.
# The values are not normalized here (they do not sum to 1); presumably the
# consumer treats them as relative weights — TODO confirm.
train_weights:
  dolma/algebraic-stack: 12.6
  dolma/arxiv: 28.0
  dolma/c4: 124.95
  dolma/cc: 597.75
  dolma/cc-news: 14.3
  dolma/falcon: 456.4
  dolma/flan: 16.5
  dolma/gutenberg: 5.3
  dolma/megawika: 4.6
  dolma/open-web-math: 12.6
  dolma/pes2o: 57.2
  dolma/reddit: 79.9
  dolma/stackexchange: 19.6
  dolma/starcoder: 263.8
  dolma/wiki: 7.4

vocab_size: null
Loading
Loading