# 2b.yaml
---
# Dataset-mix configuration with its nesting restored from a flattened copy.
# NOTE(review): the text this was recovered from had lost all indentation,
# which left duplicate top-level keys (`remote`, `local`, `split`, `index`,
# `proportion`, `algo`, `seed`). The hierarchy below is reconstructed from
# key grouping; confirm it against the schema of the tool that loads it.

version: x0.0

# Log destination and verbosity.
logger:
  log: /dev/stdout
  level: trace

# Presumably the defaults applied to each entry under `streams`; every
# location field is null here and is set per stream below. TODO confirm.
stream:
  remote: null
  local: null
  split: null
  index: null
  download_retry: 3
  download_timeout: 60s
  hash_algos: null
  non_hashed_ok: true
  keep_zip: false

# One entry per data source: a remote URI, a local directory, and a
# proportion. The ten proportions sum to 1.000. Only `c4` and `s2` set
# `split`; the other entries leave it unset.
streams:
  c4:
    remote: oci://path/to/c4
    local: /dataset/mds/c4
    split: train
    proportion: 0.299
  markdown:
    remote: oci://path/to/markdown
    local: /dataset/mds/markdown
    proportion: 0.045
  mc4:
    remote: oci://path/to/mc4
    local: /dataset/mds/mc4
    proportion: 0.335
  redpajama:
    remote: oci://path/to/redpajama
    local: /dataset/mds/redpajama
    proportion: 0.085
  redpajama-arxiv:
    remote: oci://path/to/redpajama-arxiv
    local: /dataset/mds/redpajama-arxiv
    proportion: 0.019
  redpajama-books:
    remote: oci://path/to/redpajama-books
    local: /dataset/mds/redpajama-books
    proportion: 0.03
  redpajama-stackexchange:
    remote: oci://path/to/redpajama-stackexchange
    local: /dataset/mds/redpajama-stackexchange
    proportion: 0.014
  redpajama-wiki:
    remote: oci://path/to/redpajama-wiki
    local: /dataset/mds/redpajama-wiki
    proportion: 0.04
  s2:
    remote: oci://path/to/s2
    local: /dataset/mds/s2
    split: train
    proportion: 0.033
  stack:
    remote: oci://path/to/stack
    local: /dataset/mds/stack
    proportion: 0.1

# NOTE(review): placed at top level because `stream` already carries its
# own scalar `index` key; confirm this section is not meant to be nested.
index:
  bucket_size: 1024

# `epoch_size: 2b` is presumably a suffixed count (2 billion); the unit is
# not stated in this file. TODO confirm.
sampler:
  algo: m2
  seed: 1337
  epoch_size: 2b
  strategy: balanced

determiner:
  algo: fast
  canonical_nodes: 64
  batch_size: 4

shuffle: true

# Uses the same seed value as `sampler`. The block sizes look like suffixed
# counts (500k, 2m); verify how the loader parses them.
shuffler:
  algo: s1br
  seed: 1337
  min_block_size: 500k
  max_block_size: 2m

downloader:
  prefetch: 1024
  concurrency: 64

# Both limits are left null.
cache:
  disk: null
  memory: null