Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into ahmed_dev
Browse files Browse the repository at this point in the history
  • Loading branch information
ahmeda14960 committed May 4, 2024
2 parents ba6366c + 2516d06 commit 4f34c81
Show file tree
Hide file tree
Showing 11 changed files with 482 additions and 129 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/run_entry_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
strategy:
matrix:
python-version: ["3.10"]
jax-version: ["0.4.23"]
jax-version: ["0.4.26"]

steps:
- uses: actions/checkout@v3
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/run_ray_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
strategy:
matrix:
python-version: ["3.10"]
jax-version: ["0.4.23"]
jax-version: ["0.4.26"]

steps:
- uses: actions/checkout@v3
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/run_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
strategy:
matrix:
python-version: ["3.10"]
jax-version: ["0.4.23"]
jax-version: ["0.4.26"]

steps:
- uses: actions/checkout@v3
Expand Down
33 changes: 33 additions & 0 deletions config/gpt2_small_fast_public.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Run configuration with four top-level sections: data, model, trainer, optimizer.
# Data section: OpenWebText shards read from GCS, tokenized with the "gpt2" tokenizer.
data:
# NOTE(review): "{1..128}" looks like a brace range covering 128 training shard files -- confirm the loader expands it.
train_urls:
- "gs://pubmed-mosaic/openwebtext-sharded/openwebtext_train.{1..128}-of-128.jsonl.gz"
# Same naming pattern for validation, with 8 shards.
validation_urls:
- "gs://pubmed-mosaic/openwebtext-sharded/openwebtext_val.{1..8}-of-8.jsonl.gz"
# NOTE(review): the path suggests tokenized data is cached here -- confirm the bucket is readable/writable for this run.
cache_dir: "gs://levanter-data/tokenized/openwebtext/"
tokenizer: "gpt2"
# Model section: gpt2 type with 12 layers, 12 heads, hidden size 768, sequence length 1024.
model:
type: gpt2
hidden_dim: 768
num_heads: 12
num_layers: 12
seq_len: 1024
gradient_checkpointing: true
scale_attn_by_inverse_layer_idx: true
# Trainer section: tracking, precision, parallelism, batch size and run length.
trainer:
# Metrics are tracked with wandb under project "levanter" with the tags listed below.
tracker:
- type: wandb
project: "levanter"
tags: [ "openwebtext", "gpt2", "itest"]

# NOTE(review): presumably p= is the parameter dtype and c= the compute dtype -- confirm against the trainer docs.
mp: p=f32,c=bfloat16
model_axis_size: 1
# NOTE(review): -1 looks like a sentinel (e.g. "choose automatically") -- confirm its exact meaning.
per_device_parallelism: -1

train_batch_size: 256
num_train_steps: 10000
ray:
auto_start_cluster: false
# Optimizer section: learning rate, weight decay and warmup.
optimizer:
learning_rate: 1E-3
weight_decay: 0.1
# NOTE(review): 0.01 is presumably a fraction of total steps rather than a step count -- confirm.
warmup: 0.01
2 changes: 1 addition & 1 deletion infra/helpers/setup-tpu-vm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ pip install -U wheel
# jax and jaxlib
# libtpu sometimes has issues installing for clinical (probably firewall?)
#retry pip install -U "jax[tpu]==0.4.5" libtpu-nightly==0.1.dev20230216 -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
retry pip install -U "jax[tpu]==0.4.21" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
retry pip install -U "jax[tpu]==0.4.26" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html

# clone levanter
git clone $REPO levanter
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ dependencies = [
# Haliax changes in step with levanter, so we'll just use the git version except for releases.
# "haliax @ git+https://github.com/stanford-crfm/haliax.git@main",
"haliax>=1.4.dev291",
"equinox>=0.10.7",
"equinox>=0.11.4",
"jaxtyping>=0.2.20",
"transformers>=4.39.3",
"optax>=0.1.9",
Expand Down
36 changes: 29 additions & 7 deletions scripts/clean_old_checkpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,14 @@
# python clean_old_checkpoints.py gs://my-bucket/my-dir | xargs -I {} gsutil -m rm -r {}
import os
import sys
from datetime import datetime, timezone

import fsspec


AGE = 30 # days


def is_dir_of_checkpoints(path):
fs = fsspec.filesystem("gcs")
# if the children are named like step-XXXXX, then it's a checkpoint directory
Expand Down Expand Up @@ -45,16 +49,34 @@ def list_deletable_directories(base_dir):

# Add all checkpoint directories except the ones we need to keep
for path in checkpoint_paths:
if path != max_complete_checkpoint and path != max_000_checkpoint:
yield path
if path == max_complete_checkpoint or path == max_000_checkpoint:
continue

try:
new = False
for file in ["metadata.json", "worker-0.cert"]:
details = fs.ls(f"{path}/{file}", detail=True)
if details:
mtime = details[0]["mtime"]
age = (datetime.now(timezone.utc) - mtime).days
if age < AGE:
new = True
break

if new:
continue

except FileNotFoundError:
pass

yield path


# Usage example:
if __name__ == "__main__":
if len(sys.argv) != 2:
if len(sys.argv) < 2:
print("Usage: python clean_old_checkpoints.py <base_dir>")
sys.exit(1)
base_dir = sys.argv[1]

for path in list_deletable_directories(base_dir):
print(f"gs://{path}")
for base_dir in sys.argv[1:]:
for path in list_deletable_directories(base_dir):
print(f"gs://{path}")
Loading

0 comments on commit 4f34c81

Please sign in to comment.