From 471280904b08805f19ab6e64914867cda5ffcc5d Mon Sep 17 00:00:00 2001 From: Cosmin Cojocaru Date: Sat, 29 Jun 2024 17:53:55 +0300 Subject: [PATCH 1/3] initial training loop for vjepa --- .gitignore | 2 + configs/evals/vith16_k400_16x8x3.yaml | 4 +- configs/evals/vitl16_k400_16x8x3.yaml | 4 +- evals/main.py | 4 +- evals/video_classification_frozen/eval.py | 224 ++++++++++++---------- requirements.txt | 1 - src/__init__.py | 0 src/datasets/football_frames_dataset.py | 67 +++++++ src/datasets/main.py | 24 +++ src/datasets/utils/video/functional.py | 16 ++ 10 files changed, 233 insertions(+), 113 deletions(-) create mode 100644 src/__init__.py create mode 100644 src/datasets/football_frames_dataset.py create mode 100644 src/datasets/main.py diff --git a/.gitignore b/.gitignore index 3bb2efd7..9343c1bc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ .*.swp *.pyc +/torch310/ +src/datasets/spotting-ball-2024 \ No newline at end of file diff --git a/configs/evals/vith16_k400_16x8x3.yaml b/configs/evals/vith16_k400_16x8x3.yaml index 7605790d..79945b27 100644 --- a/configs/evals/vith16_k400_16x8x3.yaml +++ b/configs/evals/vith16_k400_16x8x3.yaml @@ -34,6 +34,6 @@ pretrain: tight_silu: false use_sdpa: true patch_size: 16 - folder: /your_absolute_file_path_to_directory_where_pretrained_models_are_contained/ - checkpoint: jepa-latest.pth.tar # name of pretrained model file inside folder + folder: /Users/cosmincojocaru/Downloads + checkpoint: vitl16.pth.tar # name of pretrained model file inside folder write_tag: jepa diff --git a/configs/evals/vitl16_k400_16x8x3.yaml b/configs/evals/vitl16_k400_16x8x3.yaml index b7bcf052..4efb860b 100644 --- a/configs/evals/vitl16_k400_16x8x3.yaml +++ b/configs/evals/vitl16_k400_16x8x3.yaml @@ -34,6 +34,6 @@ pretrain: tight_silu: false use_sdpa: true patch_size: 16 - folder: /your_absolute_file_path_to_directory_where_pretrained_models_are_contained/ - checkpoint: jepa-latest.pth.tar # name of pretrained model file inside folder + folder: /Users/cosmincojocaru/Downloads + checkpoint: vitl16.pth.tar # name of pretrained model file inside folder write_tag: jepa diff --git a/evals/main.py b/evals/main.py index c614edb8..67ee39d6 100644 --- a/evals/main.py +++ b/evals/main.py @@ -20,9 +20,9 @@ parser.add_argument( '--fname', type=str, help='name of config file to load', - default='configs.yaml') + default='../configs/evals/vitl16_k400_16x8x3.yaml') parser.add_argument( - '--devices', type=str, nargs='+', default=['cuda:0'], + '--devices', type=str, nargs='+', default=['cpu:0'], help='which devices to use on local machine') diff --git a/evals/video_classification_frozen/eval.py b/evals/video_classification_frozen/eval.py index f81f526d..8da287c2 100644 --- a/evals/video_classification_frozen/eval.py +++ b/evals/video_classification_frozen/eval.py @@ -33,6 +33,9 @@ from src.datasets.data_manager import ( init_data, ) +from torch.utils.data import DataLoader + +from src.datasets.football_frames_dataset import FramesDataset from src.utils.distributed import ( init_distributed, AllReduce @@ -129,8 +132,8 @@ def main(args_eval, resume_preempt=False): device = torch.device('cuda:0') torch.cuda.set_device(device) - world_size, rank = init_distributed() - logger.info(f'Initialized (rank/world-size) {rank}/{world_size}') + # world_size, rank = init_distributed() + # logger.info(f'Initialized (rank/world-size) {rank}/{world_size}') # -- log/checkpointing paths folder = os.path.join(pretrain_folder, 'video_classification_frozen/') @@ -138,15 +141,15 @@ def main(args_eval, 
resume_preempt=False): folder = os.path.join(folder, eval_tag) if not os.path.exists(folder): os.makedirs(folder, exist_ok=True) - log_file = os.path.join(folder, f'{tag}_r{rank}.csv') - latest_path = os.path.join(folder, f'{tag}-latest.pth.tar') + # log_file = os.path.join(folder, f'{tag}_r{rank}.csv') + # latest_path = os.path.join(folder, f'{tag}-latest.pth.tar') # -- make csv_logger - if rank == 0: - csv_logger = CSVLogger(log_file, - ('%d', 'epoch'), - ('%.5f', 'loss'), - ('%.5f', 'acc')) + # if rank == 0: + # csv_logger = CSVLogger(log_file, + # ('%d', 'epoch'), + # ('%.5f', 'loss'), + # ('%.5f', 'acc')) # Initialize model @@ -164,16 +167,16 @@ def main(args_eval, resume_preempt=False): use_SiLU=use_SiLU, tight_SiLU=tight_SiLU, use_sdpa=use_sdpa) - if pretrain_frames_per_clip == 1: - # Process each frame independently and aggregate - encoder = FrameAggregation(encoder).to(device) - else: - # Process each video clip independently and aggregate - encoder = ClipAggregation( - encoder, - tubelet_size=tubelet_size, - attend_across_segments=attend_across_segments - ).to(device) + # if pretrain_frames_per_clip == 1: + # # Process each frame independently and aggregate + # encoder = FrameAggregation(encoder).to(device) + # else: + # # Process each video clip independently and aggregate + # encoder = ClipAggregation( + # encoder, + # tubelet_size=tubelet_size, + # attend_across_segments=attend_across_segments + # ).to(device) encoder.eval() for p in encoder.parameters(): p.requires_grad = False @@ -183,41 +186,48 @@ def main(args_eval, resume_preempt=False): embed_dim=encoder.embed_dim, num_heads=encoder.num_heads, depth=1, - num_classes=num_classes, + num_classes=12, ).to(device) - train_loader = make_dataloader( - dataset_type=dataset_type, - root_path=train_data_path, - resolution=resolution, - frames_per_clip=eval_frames_per_clip, - frame_step=eval_frame_step, - eval_duration=eval_duration, - num_segments=eval_num_segments if attend_across_segments else 1, - num_views_per_segment=1, - allow_segment_overlap=True, - batch_size=batch_size, - world_size=world_size, - rank=rank, - training=True) - val_loader = make_dataloader( - dataset_type=dataset_type, - root_path=val_data_path, - resolution=resolution, - frames_per_clip=eval_frames_per_clip, - frame_step=eval_frame_step, - num_segments=eval_num_segments, - eval_duration=eval_duration, - num_views_per_segment=eval_num_views_per_segment, - allow_segment_overlap=True, - batch_size=batch_size, - world_size=world_size, - rank=rank, - training=False) + # train_loader = make_dataloader( + # dataset_type=dataset_type, + # root_path=train_data_path, + # resolution=resolution, + # frames_per_clip=eval_frames_per_clip, + # frame_step=eval_frame_step, + # eval_duration=eval_duration, + # num_segments=eval_num_segments if attend_across_segments else 1, + # num_views_per_segment=1, + # allow_segment_overlap=True, + # batch_size=batch_size, + # world_size=world_size, + # rank=rank, + # training=True) + # val_loader = make_dataloader( + # dataset_type=dataset_type, + # root_path=val_data_path, + # resolution=resolution, + # frames_per_clip=eval_frames_per_clip, + # frame_step=eval_frame_step, + # num_segments=eval_num_segments, + # eval_duration=eval_duration, + # num_views_per_segment=eval_num_views_per_segment, + # allow_segment_overlap=True, + # batch_size=batch_size, + # world_size=world_size, + # rank=rank, + # training=False) + + # # -- optimizer and scheduler + val_dataset = FramesDataset('../src/datasets/spotting-ball-2024') + val_loader = 
DataLoader(val_dataset, batch_size=8) + + train_dataset = FramesDataset('../src/datasets/spotting-ball-2024') + train_loader = DataLoader(train_dataset, batch_size=8) + ipe = len(train_loader) logger.info(f'Dataloader created... iterations per epoch: {ipe}') - # -- optimizer and scheduler optimizer, scaler, scheduler, wd_scheduler = init_opt( classifier=classifier, wd=wd, @@ -228,36 +238,36 @@ def main(args_eval, resume_preempt=False): warmup=warmup, num_epochs=num_epochs, use_bfloat16=use_bfloat16) - classifier = DistributedDataParallel(classifier, static_graph=True) - - # -- load training checkpoint - start_epoch = 0 - if resume_checkpoint: - classifier, optimizer, scaler, start_epoch = load_checkpoint( - device=device, - r_path=latest_path, - classifier=classifier, - opt=optimizer, - scaler=scaler) - for _ in range(start_epoch*ipe): - scheduler.step() - wd_scheduler.step() - - def save_checkpoint(epoch): - save_dict = { - 'classifier': classifier.state_dict(), - 'opt': optimizer.state_dict(), - 'scaler': None if scaler is None else scaler.state_dict(), - 'epoch': epoch, - 'batch_size': batch_size, - 'world_size': world_size, - 'lr': lr - } - if rank == 0: - torch.save(save_dict, latest_path) + # classifier = DistributedDataParallel(classifier, static_graph=True) + + # # -- load training checkpoint + # start_epoch = 0 + # if resume_checkpoint: + # classifier, optimizer, scaler, start_epoch = load_checkpoint( + # device=device, + # r_path=latest_path, + # classifier=classifier, + # opt=optimizer, + # scaler=scaler) + # for _ in range(start_epoch*ipe): + # scheduler.step() + # wd_scheduler.step() + + # def save_checkpoint(epoch): + # save_dict = { + # 'classifier': classifier.state_dict(), + # 'opt': optimizer.state_dict(), + # 'scaler': None if scaler is None else scaler.state_dict(), + # 'epoch': epoch, + # 'batch_size': batch_size, + # 'world_size': world_size, + # 'lr': lr + # } + # if rank == 0: + # torch.save(save_dict, latest_path) # TRAIN LOOP - for epoch in range(start_epoch, num_epochs): + for epoch in range(0, 1): logger.info('Epoch %d' % (epoch + 1)) train_acc = run_one_epoch( device=device, @@ -290,9 +300,9 @@ def save_checkpoint(epoch): use_bfloat16=use_bfloat16) logger.info('[%5d] train: %.3f%% test: %.3f%%' % (epoch + 1, train_acc, val_acc)) - if rank == 0: - csv_logger.log(epoch + 1, train_acc, val_acc) - save_checkpoint(epoch + 1) + # if rank == 0: + # csv_logger.log(epoch + 1, train_acc, val_acc) + # save_checkpoint(epoch + 1) def run_one_epoch( @@ -315,46 +325,48 @@ def run_one_epoch( criterion = torch.nn.CrossEntropyLoss() top1_meter = AverageMeter() for itr, data in enumerate(data_loader): - if training: scheduler.step() wd_scheduler.step() + print("Iter", itr, data[0].shape, data[1].shape) with torch.cuda.amp.autocast(dtype=torch.float16, enabled=use_bfloat16): # Load data and put on GPU - clips = [ - [dij.to(device, non_blocking=True) for dij in di] # iterate over spatial views of clip - for di in data[0] # iterate over temporal index of clip - ] - clip_indices = [d.to(device, non_blocking=True) for d in data[2]] + # clips = [ + # [dij.to(device, non_blocking=True) for dij in di] # iterate over spatial views of clip + # for di in data[0] # iterate over temporal index of clip + # ] + # clip_indices = [d.to(device, non_blocking=True) for d in data[2]] + x = data[0].to(device) labels = data[1].to(device) batch_size = len(labels) # Forward and prediction with torch.no_grad(): - outputs = encoder(clips, clip_indices) - if not training: - if attend_across_segments: - 
outputs = [classifier(o) for o in outputs] - else: - outputs = [[classifier(ost) for ost in os] for os in outputs] - if training: - if attend_across_segments: - outputs = [classifier(o) for o in outputs] - else: - outputs = [[classifier(ost) for ost in os] for os in outputs] + outputs = encoder(x) + print("Outputs shape before", outputs.shape) + outputs = classifier(outputs) + loss = criterion(outputs, labels) + print("Outputs shape after", outputs.shape) + # if not training: + # if attend_across_segments: + # outputs = [classifier(o) for o in outputs] + # else: + # outputs = [[classifier(ost) for ost in os] for os in outputs] + # if training: + # if attend_across_segments: + # outputs = [classifier(o) for o in outputs] + # else: + # outputs = [[classifier(ost) for ost in os] for os in outputs] # Compute loss - if attend_across_segments: - loss = sum([criterion(o, labels) for o in outputs]) / len(outputs) - else: - loss = sum([sum([criterion(ost, labels) for ost in os]) for os in outputs]) / len(outputs) / len(outputs[0]) + # if attend_across_segments: + # loss = sum([criterion(o, labels) for o in outputs]) / len(outputs) + # else: + # loss = sum([sum([criterion(ost, labels) for ost in os]) for os in outputs]) / len(outputs) / len(outputs[0]) with torch.no_grad(): - if attend_across_segments: - outputs = sum([F.softmax(o, dim=1) for o in outputs]) / len(outputs) - else: - outputs = sum([sum([F.softmax(ost, dim=1) for ost in os]) for os in outputs]) / len(outputs) / len(outputs[0]) + outputs = F.softmax(outputs, dim=1) top1_acc = 100. * outputs.max(dim=1).indices.eq(labels).sum() / batch_size top1_acc = float(AllReduce.apply(top1_acc)) top1_meter.update(top1_acc) @@ -429,7 +441,7 @@ def load_pretrained( if k not in pretrained_dict: logger.info(f'key "{k}" could not be found in loaded state dict') elif pretrained_dict[k].shape != v.shape: - logger.info(f'key "{k}" is of different shape in model and loaded state dict') + logger.info(f'key "{k}" is of different shape in model and loaded state dict, {pretrained_dict[k].shape} != { v.shape}') pretrained_dict[k] = v msg = encoder.load_state_dict(pretrained_dict, strict=False) print(encoder) diff --git a/requirements.txt b/requirements.txt index d2970710..7f058508 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,6 @@ submitit braceexpand webdataset timm -decord pandas einops beartype diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/datasets/football_frames_dataset.py b/src/datasets/football_frames_dataset.py new file mode 100644 index 00000000..8b0a729f --- /dev/null +++ b/src/datasets/football_frames_dataset.py @@ -0,0 +1,67 @@ +import json +import os +import cv2 +import numpy as np + +from src.datasets.utils.video.functional import * +from torch.utils.data import Dataset + + +actions = "Pass, Drive, Header, High Pass, Out, Cross, Throw In, Shot, Ball Player Block, Player Successful Tackle, Free Kick, Goal".split(", ") +action_to_id = {action.upper(): action_id for action_id, action in enumerate(actions)} + +class FramesDataset(Dataset): + def __init__(self, root_dir, frame_window_size=64, frame_step=8, frame_dim=(224, 224), n_channels=3, shuffle=True): + self.frame_dim = frame_dim + self.frame_window_size = frame_window_size + self.frame_step = frame_step + self.n_channels = n_channels + self.shuffle = shuffle + self.frames_info = [] + + # Load annotations and create a global index for frames + for root, dirs, files in os.walk(root_dir): + for match_folder in dirs: + 
match_folder_path = os.path.join(root, match_folder) + video_files = [f for f in os.listdir(match_folder_path) if f.endswith('.mp4')] + if len(video_files) != 2: + continue + labels_file = 'Labels-ball.json' + if labels_file not in os.listdir(match_folder_path): + continue + with open(match_folder_path + '/' + labels_file, 'r') as f: + data = json.load(f) + annotations = data['annotations'] + for ann in annotations[:10]: + video_path = os.path.join(root_dir, data['UrlLocal'], '720p.mp4'.format(ann['gameTime'][0])) + frame_count = get_frame_count(video_path) + start_frame = int(ann['position']) + if int(start_frame/40) + self.frame_window_size < frame_count: + self.frames_info.append((video_path, int(start_frame/40), action_to_id[ann['label']])) + if shuffle: + np.random.shuffle(self.frames_info) + + def __len__(self): + return int(np.floor(len(self.frames_info))) + + def __getitem__(self, index): + action_frames_info = self.frames_info[index] + raw_frames, target = self.data_generation(action_frames_info) + raw_frames = np.transpose(raw_frames, (3, 0, 1, 2)) + return raw_frames, target + + + def data_generation(self, action_frames_info): + video_path, start_frame, y = action_frames_info + cap = cv2.VideoCapture(video_path) + frames = [] + for frame_index in range(0, self.frame_window_size, self.frame_step): + cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame + frame_index) + ret, frame = cap.read() + frame = cv2.resize(frame, self.frame_dim) + frames.append(frame.astype(np.float32) / 255.) + if not ret: + break # Reached the end of the video + cap.release() + + return np.array(frames), y \ No newline at end of file diff --git a/src/datasets/main.py b/src/datasets/main.py new file mode 100644 index 00000000..c928e652 --- /dev/null +++ b/src/datasets/main.py @@ -0,0 +1,24 @@ +import matplotlib.pyplot as plt +import cv2 + +from football_frames_dataset import FramesDataset +from torch.utils.data import DataLoader + + +if __name__== "__main__": + val_dataset = FramesDataset('./spotting-ball-2024') + raw_frames, label = val_dataset[0] + print(raw_frames.shape, label) + val_loader = DataLoader(val_dataset, batch_size=8) + batch = next(iter(val_loader)) + print(batch[0].shape, batch[1].shape, len(batch)) + # Plot the images using matplotlib + # fig, axes = plt.subplots(2, 4, figsize=(15, 8)) + # axes = axes.ravel() + + # for i in range(8): + # axes[i].imshow(raw_frames[i]) + # axes[i].axis('off') + + # plt.tight_layout() + # plt.show() \ No newline at end of file diff --git a/src/datasets/utils/video/functional.py b/src/datasets/utils/video/functional.py index a91d15d2..77d3412b 100644 --- a/src/datasets/utils/video/functional.py +++ b/src/datasets/utils/video/functional.py @@ -94,3 +94,19 @@ def normalize(clip, mean, std, inplace=False): clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None]) return clip + +def get_frame_count(video_path): + # Create a VideoCapture object + cap = cv2.VideoCapture(video_path) + + # Check if video opened successfully + if not cap.isOpened(): + print("Error opening video file") + return None + + # Get the total number of frames in the video + frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + # Release the VideoCapture object + cap.release() + + return frame_count \ No newline at end of file From cbc9411ca9269128c0677ac4007e9ed00b267c48 Mon Sep 17 00:00:00 2001 From: gothera Date: Fri, 5 Jul 2024 13:04:33 +0000 Subject: [PATCH 2/3] started training in gpu cloud --- .gitignore | 3 +- .../requirements-checkpoint.txt | 12 + 
.../vitl16_k400_16x8x3-checkpoint.yaml | 39 ++ configs/evals/vitl16_k400_16x8x3.yaml | 4 +- evals/.ipynb_checkpoints/main-checkpoint.py | 67 ++ evals/main.py | 2 +- .../.ipynb_checkpoints/eval-checkpoint.py | 571 ++++++++++++++++++ .../.ipynb_checkpoints/utils-checkpoint.py | 343 +++++++++++ evals/video_classification_frozen/eval.py | 40 +- evals/video_classification_frozen/utils.py | 2 +- .../data_manager-checkpoint.py | 91 +++ .../football_frames_dataset-checkpoint.py | 68 +++ src/datasets/football_frames_dataset.py | 9 +- 13 files changed, 1221 insertions(+), 30 deletions(-) create mode 100644 .ipynb_checkpoints/requirements-checkpoint.txt create mode 100644 configs/evals/.ipynb_checkpoints/vitl16_k400_16x8x3-checkpoint.yaml create mode 100644 evals/.ipynb_checkpoints/main-checkpoint.py create mode 100644 evals/video_classification_frozen/.ipynb_checkpoints/eval-checkpoint.py create mode 100644 evals/video_classification_frozen/.ipynb_checkpoints/utils-checkpoint.py create mode 100644 src/datasets/.ipynb_checkpoints/data_manager-checkpoint.py create mode 100644 src/datasets/.ipynb_checkpoints/football_frames_dataset-checkpoint.py diff --git a/.gitignore b/.gitignore index 9343c1bc..368ae4d5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .*.swp *.pyc /torch310/ -src/datasets/spotting-ball-2024 \ No newline at end of file +src/datasets/spotting-ball-2024 +data/ diff --git a/.ipynb_checkpoints/requirements-checkpoint.txt b/.ipynb_checkpoints/requirements-checkpoint.txt new file mode 100644 index 00000000..7f058508 --- /dev/null +++ b/.ipynb_checkpoints/requirements-checkpoint.txt @@ -0,0 +1,12 @@ +torch>=2 +torchvision +pyyaml +numpy +opencv-python +submitit +braceexpand +webdataset +timm +pandas +einops +beartype diff --git a/configs/evals/.ipynb_checkpoints/vitl16_k400_16x8x3-checkpoint.yaml b/configs/evals/.ipynb_checkpoints/vitl16_k400_16x8x3-checkpoint.yaml new file mode 100644 index 00000000..f4139e78 --- /dev/null +++ b/configs/evals/.ipynb_checkpoints/vitl16_k400_16x8x3-checkpoint.yaml @@ -0,0 +1,39 @@ +nodes: 8 +tasks_per_node: 8 +tag: k400-16x8x3 +eval_name: video_classification_frozen +resume_checkpoint: false +data: + dataset_train: /your_path_to_kinetics400_train_csv_file_index.csv + dataset_val: /your_path_to_kinetics400_val_csv_file_index.csv + dataset_type: VideoDataset + num_classes: 400 + frames_per_clip: 16 + num_segments: 8 + num_views_per_segment: 3 + frame_step: 4 +optimization: + attend_across_segments: true + num_epochs: 20 + resolution: 224 + batch_size: 32 + weight_decay: 0.01 + lr: 0.001 + start_lr: 0.001 + final_lr: 0.0 + warmup: 0. 
+ use_bfloat16: true +pretrain: + model_name: vit_large + checkpoint_key: target_encoder + clip_duration: null + frames_per_clip: 16 + tubelet_size: 2 + uniform_power: true + use_silu: false + tight_silu: false + use_sdpa: true + patch_size: 16 + folder: /home + checkpoint: vitl16.pth.tar # name of pretrained model file inside folder + write_tag: jepa diff --git a/configs/evals/vitl16_k400_16x8x3.yaml b/configs/evals/vitl16_k400_16x8x3.yaml index 4efb860b..f4139e78 100644 --- a/configs/evals/vitl16_k400_16x8x3.yaml +++ b/configs/evals/vitl16_k400_16x8x3.yaml @@ -16,7 +16,7 @@ optimization: attend_across_segments: true num_epochs: 20 resolution: 224 - batch_size: 4 + batch_size: 32 weight_decay: 0.01 lr: 0.001 start_lr: 0.001 @@ -34,6 +34,6 @@ pretrain: tight_silu: false use_sdpa: true patch_size: 16 - folder: /Users/cosmincojocaru/Downloads + folder: /home checkpoint: vitl16.pth.tar # name of pretrained model file inside folder write_tag: jepa diff --git a/evals/.ipynb_checkpoints/main-checkpoint.py b/evals/.ipynb_checkpoints/main-checkpoint.py new file mode 100644 index 00000000..2b899f4a --- /dev/null +++ b/evals/.ipynb_checkpoints/main-checkpoint.py @@ -0,0 +1,67 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# + +import argparse + +import multiprocessing as mp + +import pprint +import yaml + +from src.utils.distributed import init_distributed + +from evals.scaffold import main as eval_main + +parser = argparse.ArgumentParser() +parser.add_argument( + '--fname', type=str, + help='name of config file to load', + default='../configs/evals/vitl16_k400_16x8x3.yaml') +parser.add_argument( + '--devices', type=str, nargs='+', default=['cuda:0'], + help='which devices to use on local machine') + + +def process_main(rank, fname, world_size, devices): + import os + os.environ['CUDA_VISIBLE_DEVICES'] = str(devices[rank].split(':')[-1]) + + import logging + logging.basicConfig() + logger = logging.getLogger() + if rank == 0: + logger.setLevel(logging.INFO) + else: + logger.setLevel(logging.ERROR) + + logger.info(f'called-params {fname}') + + # Load config + params = None + with open(fname, 'r') as y_file: + params = yaml.load(y_file, Loader=yaml.FullLoader) + logger.info('loaded params...') + pp = pprint.PrettyPrinter(indent=4) + pp.pprint(params) + + # Init distributed (access to comm between GPUS on same machine) + world_size, rank = init_distributed(rank_and_world_size=(rank, world_size)) + logger.info(f'Running... 
(rank: {rank}/{world_size})') + + # Launch the eval with loaded config + eval_main(params['eval_name'], args_eval=params) + + +if __name__ == '__main__': + args = parser.parse_args() + num_gpus = len(args.devices) + mp.set_start_method('spawn') + for rank in range(num_gpus): + mp.Process( + target=process_main, + args=(rank, args.fname, num_gpus, args.devices) + ).start() diff --git a/evals/main.py b/evals/main.py index 67ee39d6..2b899f4a 100644 --- a/evals/main.py +++ b/evals/main.py @@ -22,7 +22,7 @@ help='name of config file to load', default='../configs/evals/vitl16_k400_16x8x3.yaml') parser.add_argument( - '--devices', type=str, nargs='+', default=['cpu:0'], + '--devices', type=str, nargs='+', default=['cuda:0'], help='which devices to use on local machine') diff --git a/evals/video_classification_frozen/.ipynb_checkpoints/eval-checkpoint.py b/evals/video_classification_frozen/.ipynb_checkpoints/eval-checkpoint.py new file mode 100644 index 00000000..f9038edb --- /dev/null +++ b/evals/video_classification_frozen/.ipynb_checkpoints/eval-checkpoint.py @@ -0,0 +1,571 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# + +import os + +# -- FOR DISTRIBUTED TRAINING ENSURE ONLY 1 DEVICE VISIBLE PER PROCESS +try: + # -- WARNING: IF DOING DISTRIBUTED TRAINING ON A NON-SLURM CLUSTER, MAKE + # -- SURE TO UPDATE THIS TO GET LOCAL-RANK ON NODE, OR ENSURE + # -- THAT YOUR JOBS ARE LAUNCHED WITH ONLY 1 DEVICE VISIBLE + # -- TO EACH PROCESS + os.environ['CUDA_VISIBLE_DEVICES'] = os.environ['SLURM_LOCALID'] +except Exception: + pass + +import logging +import pprint + +import numpy as np + +import torch +import torch.multiprocessing as mp +import torch.nn.functional as F + +from torch.nn.parallel import DistributedDataParallel + +import src.models.vision_transformer as vit +from src.models.attentive_pooler import AttentiveClassifier +from src.datasets.data_manager import ( + init_data, +) +from torch.utils.data import DataLoader + +from src.datasets.football_frames_dataset import FramesDataset +from src.utils.distributed import ( + init_distributed, + AllReduce +) +from src.utils.schedulers import ( + WarmupCosineSchedule, + CosineWDSchedule, +) +from src.utils.logging import ( + AverageMeter, + CSVLogger +) + +from evals.video_classification_frozen.utils import ( + make_transforms, + ClipAggregation, + FrameAggregation, + EvalVideoTransform +) + +logging.basicConfig() +logger = logging.getLogger() +logger.setLevel(logging.INFO) + +_GLOBAL_SEED = 0 +np.random.seed(_GLOBAL_SEED) +torch.manual_seed(_GLOBAL_SEED) +torch.backends.cudnn.benchmark = True + +pp = pprint.PrettyPrinter(indent=4) + + +def main(args_eval, resume_preempt=False): + + # ----------------------------------------------------------------------- # + # PASSED IN PARAMS FROM CONFIG FILE + # ----------------------------------------------------------------------- # + + # -- PRETRAIN + args_pretrain = args_eval.get('pretrain') + checkpoint_key = args_pretrain.get('checkpoint_key', 'target_encoder') + model_name = args_pretrain.get('model_name', None) + patch_size = args_pretrain.get('patch_size', None) + pretrain_folder = args_pretrain.get('folder', None) + ckp_fname = args_pretrain.get('checkpoint', None) + tag = args_pretrain.get('write_tag', None) + use_sdpa = args_pretrain.get('use_sdpa', True) + use_SiLU = args_pretrain.get('use_silu', False) + tight_SiLU = 
args_pretrain.get('tight_silu', True) + uniform_power = args_pretrain.get('uniform_power', False) + pretrained_path = os.path.join(pretrain_folder, ckp_fname) + # Optional [for Video model]: + tubelet_size = args_pretrain.get('tubelet_size', 2) + pretrain_frames_per_clip = args_pretrain.get('frames_per_clip', 1) + + # -- DATA + args_data = args_eval.get('data') + train_data_path = [args_data.get('dataset_train')] + val_data_path = [args_data.get('dataset_val')] + dataset_type = args_data.get('dataset_type', 'VideoDataset') + num_classes = args_data.get('num_classes') + eval_num_segments = args_data.get('num_segments', 1) + eval_frames_per_clip = args_data.get('frames_per_clip', 16) + eval_frame_step = args_pretrain.get('frame_step', 4) + eval_duration = args_pretrain.get('clip_duration', None) + eval_num_views_per_segment = args_data.get('num_views_per_segment', 1) + + # -- OPTIMIZATION + args_opt = args_eval.get('optimization') + resolution = args_opt.get('resolution', 224) + batch_size = args_opt.get('batch_size') + attend_across_segments = args_opt.get('attend_across_segments', False) + num_epochs = args_opt.get('num_epochs') + wd = args_opt.get('weight_decay') + start_lr = args_opt.get('start_lr') + lr = args_opt.get('lr') + final_lr = args_opt.get('final_lr') + warmup = args_opt.get('warmup') + use_bfloat16 = args_opt.get('use_bfloat16') + + # -- EXPERIMENT-ID/TAG (optional) + resume_checkpoint = args_eval.get('resume_checkpoint', False) or resume_preempt + eval_tag = args_eval.get('tag', None) + + # ----------------------------------------------------------------------- # + + try: + mp.set_start_method('spawn') + except Exception: + pass + + if not torch.cuda.is_available(): + device = torch.device('cpu') + else: + device = torch.device('cuda:0') + torch.cuda.set_device(device) + + # world_size, rank = init_distributed() + # logger.info(f'Initialized (rank/world-size) {rank}/{world_size}') + + # -- log/checkpointing paths + folder = os.path.join(pretrain_folder, 'video_classification_frozen/') + if eval_tag is not None: + folder = os.path.join(folder, eval_tag) + if not os.path.exists(folder): + os.makedirs(folder, exist_ok=True) + # log_file = os.path.join(folder, f'{tag}_r{rank}.csv') + # latest_path = os.path.join(folder, f'{tag}-latest.pth.tar') + + # -- make csv_logger + # if rank == 0: + # csv_logger = CSVLogger(log_file, + # ('%d', 'epoch'), + # ('%.5f', 'loss'), + # ('%.5f', 'acc')) + + # Initialize model + + # -- pretrained encoder (frozen) + encoder = init_model( + crop_size=resolution, + device=device, + pretrained=pretrained_path, + model_name=model_name, + patch_size=patch_size, + tubelet_size=tubelet_size, + frames_per_clip=pretrain_frames_per_clip, + uniform_power=uniform_power, + checkpoint_key=checkpoint_key, + use_SiLU=use_SiLU, + tight_SiLU=tight_SiLU, + use_sdpa=use_sdpa) + # if pretrain_frames_per_clip == 1: + # # Process each frame independently and aggregate + # encoder = FrameAggregation(encoder).to(device) + # else: + # # Process each video clip independently and aggregate + # encoder = ClipAggregation( + # encoder, + # tubelet_size=tubelet_size, + # attend_across_segments=attend_across_segments + # ).to(device) + encoder.eval() + for p in encoder.parameters(): + p.requires_grad = False + + # -- init classifier + classifier = AttentiveClassifier( + embed_dim=encoder.embed_dim, + num_heads=encoder.num_heads, + depth=1, + num_classes=12, + ).to(device) + + # train_loader = make_dataloader( + # dataset_type=dataset_type, + # root_path=train_data_path, + # 
resolution=resolution, + # frames_per_clip=eval_frames_per_clip, + # frame_step=eval_frame_step, + # eval_duration=eval_duration, + # num_segments=eval_num_segments if attend_across_segments else 1, + # num_views_per_segment=1, + # allow_segment_overlap=True, + # batch_size=batch_size, + # world_size=world_size, + # rank=rank, + # training=True) + # val_loader = make_dataloader( + # dataset_type=dataset_type, + # root_path=val_data_path, + # resolution=resolution, + # frames_per_clip=eval_frames_per_clip, + # frame_step=eval_frame_step, + # num_segments=eval_num_segments, + # eval_duration=eval_duration, + # num_views_per_segment=eval_num_views_per_segment, + # allow_segment_overlap=True, + # batch_size=batch_size, + # world_size=world_size, + # rank=rank, + # training=False) + + tfms = EvalVideoTransform(num_views_per_clip=eval_num_segments) + # # -- optimizer and scheduler + val_dataset = FramesDataset('../data/spotting-ball-2024/train', transforms=tfms) + val_loader = DataLoader(val_dataset, batch_size=64) + + train_dataset = FramesDataset('../data/spotting-ball-2024/valid', transforms=tfms) + train_loader = DataLoader(train_dataset, batch_size=64) + + ipe = len(train_loader) + logger.info(f'Dataloader created... iterations per epoch: {ipe}') + + optimizer, scaler, scheduler, wd_scheduler = init_opt( + classifier=classifier, + wd=wd, + start_lr=start_lr, + ref_lr=lr, + final_lr=final_lr, + iterations_per_epoch=ipe, + warmup=warmup, + num_epochs=num_epochs, + use_bfloat16=use_bfloat16) + # classifier = DistributedDataParallel(classifier, static_graph=True) + + # # -- load training checkpoint + # start_epoch = 0 + # if resume_checkpoint: + # classifier, optimizer, scaler, start_epoch = load_checkpoint( + # device=device, + # r_path=latest_path, + # classifier=classifier, + # opt=optimizer, + # scaler=scaler) + # for _ in range(start_epoch*ipe): + # scheduler.step() + # wd_scheduler.step() + + def save_checkpoint(epoch): + save_dict = { + 'classifier': classifier.state_dict(), + 'opt': optimizer.state_dict(), + 'scaler': None if scaler is None else scaler.state_dict(), + 'epoch': epoch, + 'batch_size': batch_size, + 'lr': lr + } + torch.save(save_dict, latest_path) + + # TRAIN LOOP + for epoch in range(0, 1): + logger.info('Epoch %d' % (epoch + 1)) + train_acc = run_one_epoch( + device=device, + training=True, + num_temporal_views=eval_num_segments if attend_across_segments else 1, + attend_across_segments=attend_across_segments, + num_spatial_views=1, + encoder=encoder, + classifier=classifier, + scaler=scaler, + optimizer=optimizer, + scheduler=scheduler, + wd_scheduler=wd_scheduler, + data_loader=train_loader, + use_bfloat16=use_bfloat16) + + val_acc = run_one_epoch( + device=device, + training=False, + num_temporal_views=eval_num_segments, + attend_across_segments=attend_across_segments, + num_spatial_views=eval_num_views_per_segment, + encoder=encoder, + classifier=classifier, + scaler=scaler, + optimizer=optimizer, + scheduler=scheduler, + wd_scheduler=wd_scheduler, + data_loader=val_loader, + use_bfloat16=use_bfloat16) + + logger.info('[%5d] train: %.3f%% test: %.3f%%' % (epoch + 1, train_acc, val_acc)) + # if rank == 0: + # csv_logger.log(epoch + 1, train_acc, val_acc) + save_checkpoint(epoch + 1) + + +def run_one_epoch( + device, + training, + encoder, + classifier, + scaler, + optimizer, + scheduler, + wd_scheduler, + data_loader, + use_bfloat16, + num_spatial_views, + num_temporal_views, + attend_across_segments, +): + + classifier.train(mode=training) + criterion = 
torch.nn.CrossEntropyLoss() + top1_meter = AverageMeter() + for itr, data in enumerate(data_loader): + if training: + scheduler.step() + wd_scheduler.step() + print("Iter", itr, data[0].shape, data[1].shape) + + with torch.cuda.amp.autocast(dtype=torch.float16, enabled=use_bfloat16): + + # Load data and put on GPU + # clips = [ + # [dij.to(device, non_blocking=True) for dij in di] # iterate over spatial views of clip + # for di in data[0] # iterate over temporal index of clip + # ] + # clip_indices = [d.to(device, non_blocking=True) for d in data[2]] + x = data[0].to(device) + labels = data[1].to(device) + batch_size = len(labels) + + # Forward and prediction + with torch.no_grad(): + outputs = encoder(x) + outputs = classifier(outputs) + loss = criterion(outputs, labels) + # if not training: + # if attend_across_segments: + # outputs = [classifier(o) for o in outputs] + # else: + # outputs = [[classifier(ost) for ost in os] for os in outputs] + # if training: + # if attend_across_segments: + # outputs = [classifier(o) for o in outputs] + # else: + # outputs = [[classifier(ost) for ost in os] for os in outputs] + + # Compute loss + # if attend_across_segments: + # loss = sum([criterion(o, labels) for o in outputs]) / len(outputs) + # else: + # loss = sum([sum([criterion(ost, labels) for ost in os]) for os in outputs]) / len(outputs) / len(outputs[0]) + with torch.no_grad(): + outputs = F.softmax(outputs, dim=1) + top1_acc = 100. * outputs.max(dim=1).indices.eq(labels).sum() / batch_size + top1_acc = float(AllReduce.apply(top1_acc)) + top1_meter.update(top1_acc) + + if training: + if use_bfloat16: + scaler.scale(loss).backward() + scaler.unscale_(optimizer) + torch.nn.utils.clip_grad_norm_(classifier.parameters(), 1.0) + scaler.step(optimizer) + scaler.update() + else: + loss.backward() + torch.nn.utils.clip_grad_norm_(classifier.parameters(), 1.0) + optimizer.step() + optimizer.zero_grad() + + if itr % 20 == 0: + logger.info('[%5d] %.3f%% (loss: %.3f) [mem: %.2e]' + % (itr, top1_meter.avg, loss, + torch.cuda.max_memory_allocated() / 1024.**2)) + + return top1_meter.avg + + +def load_checkpoint( + device, + r_path, + classifier, + opt, + scaler +): + try: + checkpoint = torch.load(r_path, map_location=torch.device('cpu')) + epoch = checkpoint['epoch'] + + # -- loading encoder + pretrained_dict = checkpoint['classifier'] + msg = classifier.load_state_dict(pretrained_dict) + logger.info(f'loaded pretrained classifier from epoch {epoch} with msg: {msg}') + + # -- loading optimizer + opt.load_state_dict(checkpoint['opt']) + if scaler is not None: + scaler.load_state_dict(checkpoint['scaler']) + logger.info(f'loaded optimizers from epoch {epoch}') + logger.info(f'read-path: {r_path}') + del checkpoint + + except Exception as e: + logger.info(f'Encountered exception when loading checkpoint {e}') + epoch = 0 + + return classifier, opt, scaler, epoch + + +def load_pretrained( + encoder, + pretrained, + checkpoint_key='target_encoder' +): + logger.info(f'Loading pretrained model from {pretrained}') + checkpoint = torch.load(pretrained, map_location='cpu') + try: + pretrained_dict = checkpoint[checkpoint_key] + except Exception: + pretrained_dict = checkpoint['encoder'] + + pretrained_dict = {k.replace('module.', ''): v for k, v in pretrained_dict.items()} + pretrained_dict = {k.replace('backbone.', ''): v for k, v in pretrained_dict.items()} + for k, v in encoder.state_dict().items(): + if k not in pretrained_dict: + logger.info(f'key "{k}" could not be found in loaded state dict') + elif 
pretrained_dict[k].shape != v.shape: + logger.info(f'key "{k}" is of different shape in model and loaded state dict, {pretrained_dict[k].shape} != { v.shape}') + pretrained_dict[k] = v + msg = encoder.load_state_dict(pretrained_dict, strict=False) + print(encoder) + logger.info(f'loaded pretrained model with msg: {msg}') + logger.info(f'loaded pretrained encoder from epoch: {checkpoint["epoch"]}\n path: {pretrained}') + del checkpoint + return encoder + + +def make_dataloader( + root_path, + batch_size, + world_size, + rank, + dataset_type='VideoDataset', + resolution=224, + frames_per_clip=16, + frame_step=4, + num_segments=8, + eval_duration=None, + num_views_per_segment=1, + allow_segment_overlap=True, + training=False, + num_workers=12, + subset_file=None +): + # Make Video Transforms + transform = make_transforms( + training=training, + num_views_per_clip=num_views_per_segment, + random_horizontal_flip=False, + random_resize_aspect_ratio=(0.75, 4/3), + random_resize_scale=(0.08, 1.0), + reprob=0.25, + auto_augment=True, + motion_shift=False, + crop_size=resolution, + ) + + data_loader, _ = init_data( + data=dataset_type, + root_path=root_path, + transform=transform, + batch_size=batch_size, + world_size=world_size, + rank=rank, + clip_len=frames_per_clip, + frame_sample_rate=frame_step, + duration=eval_duration, + num_clips=num_segments, + allow_clip_overlap=allow_segment_overlap, + num_workers=num_workers, + copy_data=False, + drop_last=False, + subset_file=subset_file) + return data_loader + + +def init_model( + device, + pretrained, + model_name, + patch_size=16, + crop_size=224, + # Video specific parameters + frames_per_clip=16, + tubelet_size=2, + use_sdpa=False, + use_SiLU=False, + tight_SiLU=True, + uniform_power=False, + checkpoint_key='target_encoder' +): + encoder = vit.__dict__[model_name]( + img_size=crop_size, + patch_size=patch_size, + num_frames=frames_per_clip, + tubelet_size=tubelet_size, + uniform_power=uniform_power, + use_sdpa=use_sdpa, + use_SiLU=use_SiLU, + tight_SiLU=tight_SiLU, + ) + + encoder.to(device) + encoder = load_pretrained(encoder=encoder, pretrained=pretrained, checkpoint_key=checkpoint_key) + return encoder + + +def init_opt( + classifier, + iterations_per_epoch, + start_lr, + ref_lr, + warmup, + num_epochs, + wd=1e-6, + final_wd=1e-6, + final_lr=0.0, + use_bfloat16=False +): + param_groups = [ + { + 'params': (p for n, p in classifier.named_parameters() + if ('bias' not in n) and (len(p.shape) != 1)) + }, { + 'params': (p for n, p in classifier.named_parameters() + if ('bias' in n) or (len(p.shape) == 1)), + 'WD_exclude': True, + 'weight_decay': 0 + } + ] + + logger.info('Using AdamW') + optimizer = torch.optim.AdamW(param_groups) + scheduler = WarmupCosineSchedule( + optimizer, + warmup_steps=int(warmup*iterations_per_epoch), + start_lr=start_lr, + ref_lr=ref_lr, + final_lr=final_lr, + T_max=int(num_epochs*iterations_per_epoch)) + wd_scheduler = CosineWDSchedule( + optimizer, + ref_wd=wd, + final_wd=final_wd, + T_max=int(num_epochs*iterations_per_epoch)) + scaler = torch.cuda.amp.GradScaler() if use_bfloat16 else None + return optimizer, scaler, scheduler, wd_scheduler diff --git a/evals/video_classification_frozen/.ipynb_checkpoints/utils-checkpoint.py b/evals/video_classification_frozen/.ipynb_checkpoints/utils-checkpoint.py new file mode 100644 index 00000000..d6881824 --- /dev/null +++ b/evals/video_classification_frozen/.ipynb_checkpoints/utils-checkpoint.py @@ -0,0 +1,343 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# + +import numpy as np + +import torch +import torch.nn as nn +import torchvision.transforms as transforms + +import src.datasets.utils.video.transforms as video_transforms +import src.datasets.utils.video.volume_transforms as volume_transforms + +from src.datasets.utils.video.randerase import RandomErasing + +from src.models.utils.pos_embs import get_1d_sincos_pos_embed +from src.masks.utils import apply_masks + + +class FrameAggregation(nn.Module): + """ + Process each frame independently and concatenate all tokens + """ + + def __init__( + self, + model, + max_frames=10000, + use_pos_embed=False, + attend_across_segments=False + ): + super().__init__() + self.model = model + self.embed_dim = embed_dim = model.embed_dim + self.num_heads = model.num_heads + self.attend_across_segments = attend_across_segments + # 1D-temporal pos-embedding + self.pos_embed = None + if use_pos_embed: + self.pos_embed = nn.Parameter( + torch.zeros(1, max_frames, embed_dim), + requires_grad=False) + sincos = get_1d_sincos_pos_embed(embed_dim, max_frames) + self.pos_embed.copy_(torch.from_numpy(sincos).float().unsqueeze(0)) + + def forward(self, x, clip_indices=None): + + # TODO: implement attend_across_segments=False + # num_clips = len(x) + num_views_per_clip = len(x[0]) + + # Concatenate views along batch dimension + x = [torch.cat(xi, dim=0) for xi in x] + # Concatenate clips along temporal dimension + x = torch.cat(x, dim=2) + B, C, T, H, W = x.size() + + # Put each frame along the batch dimension + x = x.permute(0, 2, 1, 3, 4).reshape(B*T, C, H, W) + + outputs = self.model(x) + _, N, D = outputs.size() + outputs = outputs.reshape(B, T, N, D).flatten(1, 2) + + # Separate views into list + B = B // num_views_per_clip + all_outputs = [] + for i in range(num_views_per_clip): + o = outputs[i*B:(i+1)*B] + # Compute positional embedding + if (self.pos_embed is not None) and (clip_indices is not None): + pos_embed = self.pos_embed.repeat(B, 1, 1) # [B, F, D] + pos_embed = apply_masks(pos_embed, clip_indices, concat=False) # list(Tensor([B, T, D])) + pos_embed = torch.cat(pos_embed, dim=1) # concatenate along temporal dimension + pos_embed = pos_embed.unsqueeze(2).repeat(1, 1, N, 1) # [B, T*num_clips, N, D] + pos_embed = pos_embed.flatten(1, 2) + o += pos_embed + all_outputs += [o] + + return all_outputs + + +class ClipAggregation(nn.Module): + """ + Process each clip independently and concatenate all tokens + """ + + def __init__( + self, + model, + tubelet_size=2, + max_frames=10000, + use_pos_embed=False, + attend_across_segments=False + ): + super().__init__() + self.model = model + self.tubelet_size = tubelet_size + self.embed_dim = embed_dim = model.embed_dim + self.num_heads = model.num_heads + self.attend_across_segments = attend_across_segments + # 1D-temporal pos-embedding + self.pos_embed = None + if use_pos_embed: + max_T = max_frames // tubelet_size + self.pos_embed = nn.Parameter( + torch.zeros(1, max_T, embed_dim), + requires_grad=False) + sincos = get_1d_sincos_pos_embed(embed_dim, max_T) + self.pos_embed.copy_(torch.from_numpy(sincos).float().unsqueeze(0)) + + def forward(self, x, clip_indices=None): + + num_clips = len(x) + num_views_per_clip = len(x[0]) + B, C, T, H, W = x[0][0].size() + + # Concatenate all spatial and temporal views along batch dimension + x = [torch.cat(xi, dim=0) for xi in x] + x = torch.cat(x, dim=0) + outputs = self.model(x) 
+ _, N, D = outputs.size() + + T = T // self.tubelet_size # Num temporal tokens + N = N // T # Num spatial tokens + + # Unroll outputs into a 2D array [spatial_views x temporal_views] + eff_B = B * num_views_per_clip + all_outputs = [[] for _ in range(num_views_per_clip)] + for i in range(num_clips): + o = outputs[i*eff_B:(i+1)*eff_B] + for j in range(num_views_per_clip): + all_outputs[j].append(o[j*B:(j+1)*B]) + + if not self.attend_across_segments: + return all_outputs + + for i, outputs in enumerate(all_outputs): + + # Concatenate along temporal dimension + outputs = [o.reshape(B, T, N, D) for o in outputs] + outputs = torch.cat(outputs, dim=1).flatten(1, 2) + + # Compute positional embedding + if (self.pos_embed is not None) and (clip_indices is not None): + clip_indices = [c[:, ::self.tubelet_size] for c in clip_indices] + pos_embed = self.pos_embed.repeat(B, 1, 1) # [B, F, D] + pos_embed = apply_masks(pos_embed, clip_indices, concat=False) # list(Tensor([B, T, D])) + pos_embed = torch.cat(pos_embed, dim=1) # concatenate along temporal dimension + pos_embed = pos_embed.unsqueeze(2).repeat(1, 1, N, 1) # [B, T*num_clips, N, D] + pos_embed = pos_embed.flatten(1, 2) + outputs += pos_embed + + all_outputs[i] = outputs + + return all_outputs + + +def make_transforms( + training=True, + random_horizontal_flip=True, + random_resize_aspect_ratio=(3/4, 4/3), + random_resize_scale=(0.3, 1.0), + reprob=0.0, + auto_augment=False, + motion_shift=False, + crop_size=224, + num_views_per_clip=1, + normalize=((0.485, 0.456, 0.406), + (0.229, 0.224, 0.225)) +): + + if not training and num_views_per_clip > 1: + print('Making EvalVideoTransform, multi-view') + _frames_augmentation = EvalVideoTransform( + num_views_per_clip=num_views_per_clip, + short_side_size=crop_size, + normalize=normalize, + ) + + else: + _frames_augmentation = VideoTransform( + training=training, + random_horizontal_flip=random_horizontal_flip, + random_resize_aspect_ratio=random_resize_aspect_ratio, + random_resize_scale=random_resize_scale, + reprob=reprob, + auto_augment=auto_augment, + motion_shift=motion_shift, + crop_size=crop_size, + normalize=normalize, + ) + return _frames_augmentation + + +class VideoTransform(object): + + def __init__( + self, + training=True, + random_horizontal_flip=True, + random_resize_aspect_ratio=(3/4, 4/3), + random_resize_scale=(0.3, 1.0), + reprob=0.0, + auto_augment=False, + motion_shift=False, + crop_size=224, + normalize=((0.485, 0.456, 0.406), + (0.229, 0.224, 0.225)) + ): + + self.training = training + + short_side_size = int(crop_size * 256 / 224) + self.eval_transform = video_transforms.Compose([ + video_transforms.Resize(short_side_size, interpolation='bilinear'), + video_transforms.CenterCrop(size=(crop_size, crop_size)), + volume_transforms.ClipToTensor(), + video_transforms.Normalize(mean=normalize[0], std=normalize[1]) + ]) + + self.random_horizontal_flip = random_horizontal_flip + self.random_resize_aspect_ratio = random_resize_aspect_ratio + self.random_resize_scale = random_resize_scale + self.auto_augment = auto_augment + self.motion_shift = motion_shift + self.crop_size = crop_size + self.normalize = torch.tensor(normalize) + + self.autoaug_transform = video_transforms.create_random_augment( + input_size=(crop_size, crop_size), + auto_augment='rand-m7-n4-mstd0.5-inc1', + interpolation='bicubic', + ) + + self.spatial_transform = video_transforms.random_resized_crop_with_shift \ + if motion_shift else video_transforms.random_resized_crop + + self.reprob = reprob + 
self.erase_transform = RandomErasing( + reprob, + mode='pixel', + max_count=1, + num_splits=1, + device='cpu', + ) + + def __call__(self, buffer): + + if not self.training: + return [self.eval_transform(buffer)] + + buffer = [transforms.ToPILImage()(frame) for frame in buffer] + + if self.auto_augment: + buffer = self.autoaug_transform(buffer) + + buffer = [transforms.ToTensor()(img) for img in buffer] + buffer = torch.stack(buffer) # T C H W + buffer = buffer.permute(0, 2, 3, 1) # T H W C + + buffer = tensor_normalize(buffer, self.normalize[0], self.normalize[1]) + buffer = buffer.permute(3, 0, 1, 2) # T H W C -> C T H W + + buffer = self.spatial_transform( + images=buffer, + target_height=self.crop_size, + target_width=self.crop_size, + scale=self.random_resize_scale, + ratio=self.random_resize_aspect_ratio, + ) + if self.random_horizontal_flip: + buffer, _ = video_transforms.horizontal_flip(0.5, buffer) + + if self.reprob > 0: + buffer = buffer.permute(1, 0, 2, 3) + buffer = self.erase_transform(buffer) + buffer = buffer.permute(1, 0, 2, 3) + + return [buffer] + + +class EvalVideoTransform(object): + + def __init__( + self, + num_views_per_clip=1, + short_side_size=224, + normalize=((0.485, 0.456, 0.406), + (0.229, 0.224, 0.225)) + ): + self.views_per_clip = num_views_per_clip + self.short_side_size = short_side_size + self.spatial_resize = video_transforms.Resize(short_side_size, interpolation='bilinear') + self.to_tensor = video_transforms.Compose([ + volume_transforms.ClipToTensor(), + video_transforms.Normalize(mean=normalize[0], std=normalize[1]) + ]) + + def __call__(self, buffer): + + # Sample several spatial views of each clip + buffer = np.array(self.spatial_resize(buffer)) + T, H, W, C = buffer.shape + + num_views = self.views_per_clip + side_len = self.short_side_size + spatial_step = (max(H, W) - side_len) // (num_views - 1) + + all_views = [] + for i in range(num_views): + start = i*spatial_step + if H > W: + view = buffer[:, start:start+side_len, :, :] + else: + view = buffer[:, :, start:start+side_len, :] + view = self.to_tensor(view) + all_views.append(view) + + return all_views[0] + + +def tensor_normalize(tensor, mean, std): + """ + Normalize a given tensor by subtracting the mean and dividing the std. + Args: + tensor (tensor): tensor to normalize. + mean (tensor or list): mean value to subtract. + std (tensor or list): std to divide. 
+ """ + if tensor.dtype == torch.uint8: + tensor = tensor.float() + tensor = tensor / 255.0 + if type(mean) == list: + mean = torch.tensor(mean) + if type(std) == list: + std = torch.tensor(std) + tensor = tensor - mean + tensor = tensor / std + return tensor diff --git a/evals/video_classification_frozen/eval.py b/evals/video_classification_frozen/eval.py index 8da287c2..f9038edb 100644 --- a/evals/video_classification_frozen/eval.py +++ b/evals/video_classification_frozen/eval.py @@ -52,7 +52,8 @@ from evals.video_classification_frozen.utils import ( make_transforms, ClipAggregation, - FrameAggregation + FrameAggregation, + EvalVideoTransform ) logging.basicConfig() @@ -218,12 +219,13 @@ def main(args_eval, resume_preempt=False): # rank=rank, # training=False) + tfms = EvalVideoTransform(num_views_per_clip=eval_num_segments) # # -- optimizer and scheduler - val_dataset = FramesDataset('../src/datasets/spotting-ball-2024') - val_loader = DataLoader(val_dataset, batch_size=8) + val_dataset = FramesDataset('../data/spotting-ball-2024/train', transforms=tfms) + val_loader = DataLoader(val_dataset, batch_size=64) - train_dataset = FramesDataset('../src/datasets/spotting-ball-2024') - train_loader = DataLoader(train_dataset, batch_size=8) + train_dataset = FramesDataset('../data/spotting-ball-2024/valid', transforms=tfms) + train_loader = DataLoader(train_dataset, batch_size=64) ipe = len(train_loader) logger.info(f'Dataloader created... iterations per epoch: {ipe}') @@ -253,18 +255,16 @@ def main(args_eval, resume_preempt=False): # scheduler.step() # wd_scheduler.step() - # def save_checkpoint(epoch): - # save_dict = { - # 'classifier': classifier.state_dict(), - # 'opt': optimizer.state_dict(), - # 'scaler': None if scaler is None else scaler.state_dict(), - # 'epoch': epoch, - # 'batch_size': batch_size, - # 'world_size': world_size, - # 'lr': lr - # } - # if rank == 0: - # torch.save(save_dict, latest_path) + def save_checkpoint(epoch): + save_dict = { + 'classifier': classifier.state_dict(), + 'opt': optimizer.state_dict(), + 'scaler': None if scaler is None else scaler.state_dict(), + 'epoch': epoch, + 'batch_size': batch_size, + 'lr': lr + } + torch.save(save_dict, latest_path) # TRAIN LOOP for epoch in range(0, 1): @@ -301,8 +301,8 @@ def main(args_eval, resume_preempt=False): logger.info('[%5d] train: %.3f%% test: %.3f%%' % (epoch + 1, train_acc, val_acc)) # if rank == 0: - # csv_logger.log(epoch + 1, train_acc, val_acc) - # save_checkpoint(epoch + 1) + # csv_logger.log(epoch + 1, train_acc, val_acc) + save_checkpoint(epoch + 1) def run_one_epoch( @@ -345,10 +345,8 @@ def run_one_epoch( # Forward and prediction with torch.no_grad(): outputs = encoder(x) - print("Outputs shape before", outputs.shape) outputs = classifier(outputs) loss = criterion(outputs, labels) - print("Outputs shape after", outputs.shape) # if not training: # if attend_across_segments: # outputs = [classifier(o) for o in outputs] diff --git a/evals/video_classification_frozen/utils.py b/evals/video_classification_frozen/utils.py index 450f799a..d6881824 100644 --- a/evals/video_classification_frozen/utils.py +++ b/evals/video_classification_frozen/utils.py @@ -320,7 +320,7 @@ def __call__(self, buffer): view = self.to_tensor(view) all_views.append(view) - return all_views + return all_views[0] def tensor_normalize(tensor, mean, std): diff --git a/src/datasets/.ipynb_checkpoints/data_manager-checkpoint.py b/src/datasets/.ipynb_checkpoints/data_manager-checkpoint.py new file mode 100644 index 00000000..cdb7ade4 --- 
/dev/null +++ b/src/datasets/.ipynb_checkpoints/data_manager-checkpoint.py @@ -0,0 +1,91 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# + +from logging import getLogger + + +_GLOBAL_SEED = 0 +logger = getLogger() + + +def init_data( + batch_size, + transform=None, + shared_transform=None, + data='ImageNet', + collator=None, + pin_mem=True, + num_workers=8, + world_size=1, + rank=0, + root_path=None, + image_folder=None, + training=True, + copy_data=False, + drop_last=True, + tokenize_txt=True, + subset_file=None, + clip_len=8, + frame_sample_rate=2, + duration=None, + num_clips=1, + random_clip_sampling=True, + allow_clip_overlap=False, + filter_short_videos=False, + filter_long_videos=int(1e9), + decode_one_clip=True, + datasets_weights=None, + persistent_workers=False, + repeat_wds=False, + ipe=300, + log_dir=None, +): + + if (data.lower() == 'imagenet') \ + or (data.lower() == 'inat21') \ + or (data.lower() == 'places205'): + from src.datasets.image_dataset import make_imagedataset + dataset, data_loader, dist_sampler = make_imagedataset( + transform=transform, + batch_size=batch_size, + collator=collator, + pin_mem=pin_mem, + training=training, + num_workers=num_workers, + world_size=world_size, + rank=rank, + root_path=root_path, + image_folder=image_folder, + persistent_workers=persistent_workers, + copy_data=copy_data, + drop_last=drop_last, + subset_file=subset_file) + + elif data.lower() == 'videodataset': + from src.datasets.video_dataset import make_videodataset + dataset, data_loader, dist_sampler = make_videodataset( + data_paths=root_path, + batch_size=batch_size, + frames_per_clip=clip_len, + frame_step=frame_sample_rate, + duration=duration, + num_clips=num_clips, + random_clip_sampling=random_clip_sampling, + allow_clip_overlap=allow_clip_overlap, + filter_short_videos=filter_short_videos, + filter_long_videos=filter_long_videos, + shared_transform=shared_transform, + transform=transform, + datasets_weights=datasets_weights, + collator=collator, + num_workers=num_workers, + world_size=world_size, + rank=rank, + drop_last=drop_last, + log_dir=log_dir) + + return (data_loader, dist_sampler) diff --git a/src/datasets/.ipynb_checkpoints/football_frames_dataset-checkpoint.py b/src/datasets/.ipynb_checkpoints/football_frames_dataset-checkpoint.py new file mode 100644 index 00000000..2176111e --- /dev/null +++ b/src/datasets/.ipynb_checkpoints/football_frames_dataset-checkpoint.py @@ -0,0 +1,68 @@ +import json +import os +import cv2 +import numpy as np + +from src.datasets.utils.video.functional import * +from torch.utils.data import Dataset + + +actions = "Pass, Drive, Header, High Pass, Out, Cross, Throw In, Shot, Ball Player Block, Player Successful Tackle, Free Kick, Goal".split(", ") +action_to_id = {action.upper(): action_id for action_id, action in enumerate(actions)} + +class FramesDataset(Dataset): + def __init__(self, root_dir, frame_window_size=64, frame_step=8, frame_dim=(224, 224), n_channels=3, shuffle=True, transforms=None): + self.frame_dim = frame_dim + self.frame_window_size = frame_window_size + self.frame_step = frame_step + self.n_channels = n_channels + self.shuffle = shuffle + self.transforms = transforms + self.frames_info = [] + + # Load annotations and create a global index for frames + for root, dirs, files in os.walk(root_dir): + for match_folder in dirs: + match_folder_path = 
os.path.join(root, match_folder) + video_files = [f for f in os.listdir(match_folder_path) if f.endswith('.mp4')] + if len(video_files) != 2: + continue + labels_file = 'Labels-ball.json' + if labels_file not in os.listdir(match_folder_path): + continue + with open(match_folder_path + '/' + labels_file, 'r') as f: + data = json.load(f) + annotations = data['annotations'] + for ann in annotations: + video_path = os.path.join(root_dir, data['UrlLocal'], '720p.mp4'.format(ann['gameTime'][0])) + frame_count = get_frame_count(video_path) + start_frame = int(ann['position']) + if int(start_frame/40) + self.frame_window_size < frame_count: + self.frames_info.append((video_path, int(start_frame/40), action_to_id[ann['label']])) + if shuffle: + np.random.shuffle(self.frames_info) + + def __len__(self): + return int(np.floor(len(self.frames_info))) + + def __getitem__(self, index): + action_frames_info = self.frames_info[index] + raw_frames, target = self.data_generation(action_frames_info) + raw_frames = self.transforms(raw_frames) + return np.array(raw_frames), target + + + def data_generation(self, action_frames_info): + video_path, start_frame, y = action_frames_info + cap = cv2.VideoCapture(video_path) + frames = [] + for frame_index in range(0, self.frame_window_size, self.frame_step): + cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame + frame_index) + ret, frame = cap.read() + frame = cv2.resize(frame, self.frame_dim) + frames.append(frame.astype(np.float32) / 255.) + if not ret: + break # Reached the end of the video + cap.release() + + return np.array(frames), y \ No newline at end of file diff --git a/src/datasets/football_frames_dataset.py b/src/datasets/football_frames_dataset.py index 8b0a729f..2176111e 100644 --- a/src/datasets/football_frames_dataset.py +++ b/src/datasets/football_frames_dataset.py @@ -11,12 +11,13 @@ action_to_id = {action.upper(): action_id for action_id, action in enumerate(actions)} class FramesDataset(Dataset): - def __init__(self, root_dir, frame_window_size=64, frame_step=8, frame_dim=(224, 224), n_channels=3, shuffle=True): + def __init__(self, root_dir, frame_window_size=64, frame_step=8, frame_dim=(224, 224), n_channels=3, shuffle=True, transforms=None): self.frame_dim = frame_dim self.frame_window_size = frame_window_size self.frame_step = frame_step self.n_channels = n_channels self.shuffle = shuffle + self.transforms = transforms self.frames_info = [] # Load annotations and create a global index for frames @@ -32,7 +33,7 @@ def __init__(self, root_dir, frame_window_size=64, frame_step=8, frame_dim=(224, with open(match_folder_path + '/' + labels_file, 'r') as f: data = json.load(f) annotations = data['annotations'] - for ann in annotations[:10]: + for ann in annotations: video_path = os.path.join(root_dir, data['UrlLocal'], '720p.mp4'.format(ann['gameTime'][0])) frame_count = get_frame_count(video_path) start_frame = int(ann['position']) @@ -47,8 +48,8 @@ def __len__(self): def __getitem__(self, index): action_frames_info = self.frames_info[index] raw_frames, target = self.data_generation(action_frames_info) - raw_frames = np.transpose(raw_frames, (3, 0, 1, 2)) - return raw_frames, target + raw_frames = self.transforms(raw_frames) + return np.array(raw_frames), target def data_generation(self, action_frames_info): From 19183f250c1100f4dfa5865481a7ac30e6d39c4b Mon Sep 17 00:00:00 2001 From: gothera Date: Fri, 5 Jul 2024 13:10:23 +0000 Subject: [PATCH 3/3] started training in gpu cloud-1 --- .../.ipynb_checkpoints/utils-checkpoint.py | 3 +-- 
evals/video_classification_frozen/utils.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/evals/video_classification_frozen/.ipynb_checkpoints/utils-checkpoint.py b/evals/video_classification_frozen/.ipynb_checkpoints/utils-checkpoint.py index d6881824..34c54275 100644 --- a/evals/video_classification_frozen/.ipynb_checkpoints/utils-checkpoint.py +++ b/evals/video_classification_frozen/.ipynb_checkpoints/utils-checkpoint.py @@ -296,8 +296,7 @@ def __init__( self.short_side_size = short_side_size self.spatial_resize = video_transforms.Resize(short_side_size, interpolation='bilinear') self.to_tensor = video_transforms.Compose([ - volume_transforms.ClipToTensor(), - video_transforms.Normalize(mean=normalize[0], std=normalize[1]) + volume_transforms.ClipToTensor() ]) def __call__(self, buffer): diff --git a/evals/video_classification_frozen/utils.py b/evals/video_classification_frozen/utils.py index d6881824..34c54275 100644 --- a/evals/video_classification_frozen/utils.py +++ b/evals/video_classification_frozen/utils.py @@ -296,8 +296,7 @@ def __init__( self.short_side_size = short_side_size self.spatial_resize = video_transforms.Resize(short_side_size, interpolation='bilinear') self.to_tensor = video_transforms.Compose([ - volume_transforms.ClipToTensor(), - video_transforms.Normalize(mean=normalize[0], std=normalize[1]) + volume_transforms.ClipToTensor() ]) def __call__(self, buffer):
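
Taken together, the three patches above turn the distributed Kinetics-400 frozen-evaluation script into a single-device training loop for SoccerNet-style ball-action spotting: FramesDataset indexes short clips around each Labels-ball.json annotation, the pretrained V-JEPA encoder is kept frozen, and only an AttentiveClassifier head with 12 action classes is trained. The following minimal sketch shows how those pieces compose end to end; it is not code taken from the patches. The dataset path, batch size, learning rate and the plain AdamW optimizer are illustrative assumptions (eval.py itself builds the optimizer via init_opt() with warmup-cosine schedules and optional AMP), and loading of the pretrained checkpoint via load_pretrained() is omitted.

import torch
from torch.utils.data import DataLoader

import src.models.vision_transformer as vit
from src.models.attentive_pooler import AttentiveClassifier
from src.datasets.football_frames_dataset import FramesDataset
from evals.video_classification_frozen.utils import EvalVideoTransform

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Frozen backbone, built the same way eval.py's init_model() builds it
# (loading the pretrained V-JEPA weights is omitted in this sketch).
encoder = vit.__dict__['vit_large'](
    img_size=224, patch_size=16, num_frames=16, tubelet_size=2).to(device)
encoder.eval()
for p in encoder.parameters():
    p.requires_grad = False

# Attentive-pooling head over the encoder tokens; 12 football action classes,
# matching the hard-coded num_classes=12 in the patched eval.py.
classifier = AttentiveClassifier(
    embed_dim=encoder.embed_dim,
    num_heads=encoder.num_heads,
    depth=1,
    num_classes=12,
).to(device)

# Clips sampled around each 'Labels-ball.json' annotation. The transform mirrors
# the patched eval.py (num_views_per_clip=8 corresponds to num_segments in the
# config); the dataset path is a placeholder.
transform = EvalVideoTransform(num_views_per_clip=8)
train_loader = DataLoader(
    FramesDataset('../data/spotting-ball-2024/train', transforms=transform),
    batch_size=8)

optimizer = torch.optim.AdamW(classifier.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()

for clips, labels in train_loader:          # clips: [B, C, T, H, W]
    clips, labels = clips.to(device), labels.to(device)
    with torch.no_grad():                   # backbone stays frozen
        tokens = encoder(clips)             # [B, N, D] token embeddings
    logits = classifier(tokens)             # [B, 12] class scores
    loss = criterion(logits, labels)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()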