Skip to content

Commit

Permalink
Removed distributed training and fixed issue with Tag name not being used as run name
Browse files Browse the repository at this point in the history
  • Loading branch information
GeroVanMi committed May 15, 2024
1 parent 9d87d1f commit 0c54ae5
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 3 deletions.
4 changes: 2 additions & 2 deletions ci/start_lightning_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
studio = create_studio()

print("Starting studio.")
- studio.start(Machine.T4_X_4)
+ studio.start(Machine.T4)

BASE_PATH = "/teamspace/studios/this_studio/algorithmic-quartet-mlops"

Expand All @@ -28,5 +28,5 @@
)

studio.run(
- f"docker run -d --gpus all -e TAG_NAME -e WANDB_API_KEY='{os.environ.get('WANDB_API_KEY')}' -e GC_BUCKET_KEY='{os.environ.get('GC_BUCKET_KEY')}' us-west2-docker.pkg.dev/algorithmic-quartet/training-images/pokemon-trainer:latest"
+ f"docker run -d --gpus all -e TAG_NAME={os.environ.get('TAG_NAME')} -e WANDB_API_KEY='{os.environ.get('WANDB_API_KEY')}' -e GC_BUCKET_KEY='{os.environ.get('GC_BUCKET_KEY')}' us-west2-docker.pkg.dev/algorithmic-quartet/training-images/pokemon-trainer:latest"
)
2 changes: 1 addition & 1 deletion training/src/configurations/TrainConfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

@dataclass
class TrainConfig(Configuration):
- train_batch_size = 96
+ train_batch_size = 24
eval_batch_size = 8
num_epochs = 60
num_images = 7351

0 comments on commit 0c54ae5

Please sign in to comment.