Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: cloud training #75

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
.*.swp
*.pyc
/torch310/
src/datasets/spotting-ball-2024
data/
12 changes: 12 additions & 0 deletions .ipynb_checkpoints/requirements-checkpoint.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
torch>=2
torchvision
pyyaml
numpy
opencv-python
submitit
braceexpand
webdataset
timm
pandas
einops
beartype
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
nodes: 8
tasks_per_node: 8
tag: k400-16x8x3
eval_name: video_classification_frozen
resume_checkpoint: false
data:
dataset_train: /your_path_to_kinetics400_train_csv_file_index.csv
dataset_val: /your_path_to_kinetics400_val_csv_file_index.csv
dataset_type: VideoDataset
num_classes: 400
frames_per_clip: 16
num_segments: 8
num_views_per_segment: 3
frame_step: 4
optimization:
attend_across_segments: true
num_epochs: 20
resolution: 224
batch_size: 32
weight_decay: 0.01
lr: 0.001
start_lr: 0.001
final_lr: 0.0
warmup: 0.
use_bfloat16: true
pretrain:
model_name: vit_large
checkpoint_key: target_encoder
clip_duration: null
frames_per_clip: 16
tubelet_size: 2
uniform_power: true
use_silu: false
tight_silu: false
use_sdpa: true
patch_size: 16
folder: /home
checkpoint: vitl16.pth.tar # name of pretrained model file inside folder
write_tag: jepa
4 changes: 2 additions & 2 deletions configs/evals/vith16_k400_16x8x3.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,6 @@ pretrain:
tight_silu: false
use_sdpa: true
patch_size: 16
folder: /your_absolute_file_path_to_directory_where_pretrained_models_are_contained/
checkpoint: jepa-latest.pth.tar # name of pretrained model file inside folder
folder: /Users/cosmincojocaru/Downloads
checkpoint: vitl16.pth.tar # name of pretrained model file inside folder
write_tag: jepa
6 changes: 3 additions & 3 deletions configs/evals/vitl16_k400_16x8x3.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ optimization:
attend_across_segments: true
num_epochs: 20
resolution: 224
batch_size: 4
batch_size: 32
weight_decay: 0.01
lr: 0.001
start_lr: 0.001
Expand All @@ -34,6 +34,6 @@ pretrain:
tight_silu: false
use_sdpa: true
patch_size: 16
folder: /your_absolute_file_path_to_directory_where_pretrained_models_are_contained/
checkpoint: jepa-latest.pth.tar # name of pretrained model file inside folder
folder: /home
checkpoint: vitl16.pth.tar # name of pretrained model file inside folder
write_tag: jepa
67 changes: 67 additions & 0 deletions evals/.ipynb_checkpoints/main-checkpoint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#

import argparse

import multiprocessing as mp

import pprint
import yaml

from src.utils.distributed import init_distributed

from evals.scaffold import main as eval_main

parser = argparse.ArgumentParser()
parser.add_argument(
'--fname', type=str,
help='name of config file to load',
default='../configs/evals/vitl16_k400_16x8x3.yaml')
parser.add_argument(
'--devices', type=str, nargs='+', default=['cuda:0'],
help='which devices to use on local machine')


def process_main(rank, fname, world_size, devices):
import os
os.environ['CUDA_VISIBLE_DEVICES'] = str(devices[rank].split(':')[-1])

import logging
logging.basicConfig()
logger = logging.getLogger()
if rank == 0:
logger.setLevel(logging.INFO)
else:
logger.setLevel(logging.ERROR)

logger.info(f'called-params {fname}')

# Load config
params = None
with open(fname, 'r') as y_file:
params = yaml.load(y_file, Loader=yaml.FullLoader)
logger.info('loaded params...')
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(params)

# Init distributed (access to comm between GPUS on same machine)
world_size, rank = init_distributed(rank_and_world_size=(rank, world_size))
logger.info(f'Running... (rank: {rank}/{world_size})')

# Launch the eval with loaded config
eval_main(params['eval_name'], args_eval=params)


if __name__ == '__main__':
args = parser.parse_args()
num_gpus = len(args.devices)
mp.set_start_method('spawn')
for rank in range(num_gpus):
mp.Process(
target=process_main,
args=(rank, args.fname, num_gpus, args.devices)
).start()
2 changes: 1 addition & 1 deletion evals/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
parser.add_argument(
'--fname', type=str,
help='name of config file to load',
default='configs.yaml')
default='../configs/evals/vitl16_k400_16x8x3.yaml')
parser.add_argument(
'--devices', type=str, nargs='+', default=['cuda:0'],
help='which devices to use on local machine')
Expand Down
Loading