Commit

modelv2 & add remote sensing config
Winter-Jon committed Apr 26, 2024
1 parent 766c867 commit afe2344
Showing 23 changed files with 5,421 additions and 70 deletions.
889 changes: 889 additions & 0 deletions benchmark.py

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions benchmark.sh
@@ -0,0 +1,10 @@
OMP_NUM_THREADS=1 \
CUDA_VISIBLE_DEVICES="1" \
python benchmark.py --results-file benchmark.txt \
--model pacavit_tiny_p2cconv_100_0 \
--bench profile_deepspeed \
--num-bench-iter 100 \
--batch-size 128 --img-size 224 --num-classes 1000 \
--opt adamw --opt-eps 1e-8 --momentum 0.9 --weight-decay 0.05 \
--smoothing 0.1 --drop-path 0.1 \
--amp --channels-last
7 changes: 4 additions & 3 deletions configs/mixformer.yaml
@@ -1,11 +1,12 @@
# data
data_dir: ./datasets/IMNET1k
dataset: imagenet
num_classes: 1000
workers: 4
pin_mem: true

# model
-model: biformer_tiny
+model: mixformer_tiny
drop: 0.0
drop_path: 0.1

@@ -21,8 +22,8 @@ weight_decay: 0.05

# sched
sched: cosine
-lr_base: 1.0e-3
-min_lr: 1.0e-5
+lr_base: 5.0e-4
+min_lr: 5.0e-6
warmup_lr: 1.0e-06
warmup_epochs: 5
weight_decay: 2.0e-05
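The halved lr_base is not the LR the optimizer sees: with lr_base_scale: linear and lr_base_size: 512 (visible in the scene-recognition config below), timm-style training first scales it by the global batch size. A minimal sketch of that resolution, assuming this repo's main.py follows the convention of timm's train.py:

# Sketch of linear LR-base scaling (an assumption: this repo builds on
# timm, whose train.py resolves the effective LR this way).
def resolve_lr(lr_base: float, global_batch_size: int,
               lr_base_size: int = 512, scale: str = "linear") -> float:
    ratio = global_batch_size / lr_base_size
    if scale == "sqrt":
        ratio **= 0.5
    return lr_base * ratio

# One GPU at batch size 128: 5.0e-4 * 128 / 512 = 1.25e-4
print(resolve_lr(5.0e-4, 128))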
82 changes: 82 additions & 0 deletions configs/mixformer_scene_recognition.yaml
@@ -0,0 +1,82 @@
# data
data_dir: ./datasets/millionaid
dataset: torch/millionaid
num_classes: 51
workers: 4
pin_mem: true

# model
model: mixformer_tiny
drop: 0.0
drop_path: 0.1

# opt
epochs: 100
opt: adamw
opt_eps: 1e-8
opt_betas:
- 0.9
- 0.999
momentum: 0.9
weight_decay: 0.05

# sched
sched: cosine
lr_base: 5.0e-4
min_lr: 5.0e-6
warmup_lr: 1.0e-06
warmup_epochs: 2
weight_decay: 2.0e-05
lr_base_scale: linear
lr_base_size: 512
auto_scale_warmup_min_lr: True

# cosine sched
lr_cycle_decay: 0.5
lr_cycle_limit: 1
lr_cycle_mul: 1.0
lr_k_decay: 1.0
lr_noise: null
lr_noise_pct: 0.67
lr_noise_std: 1.0
cooldown_epochs: 0
# patience_epochs: 10


# amp
amp: true
amp_dtype: float16
amp_impl: native

# ema
model_ema: false
model_ema_decay: 0.99996
model_ema_force_cpu: false

# mixup
mixup: 0.8
cutmix: 1.0
cutmix_minmax: null
mixup_prob: 1.0
mixup_switch_prob: 0.5
mixup_mode: batch

# others
grad_accum_steps: 1
clip_grad: null
aa: rand-m9-mstd0.5-inc1
color_jitter: 0.4
decay_rate: 0.1
smoothing: 0.1
train_interpolation: bicubic
repeated_aug: true
reprob: 0.25
remode: pixel
recount: 1
resplit: false

# log
# summary: torchinfo
# benchmark: calflops


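main.py (further down in this commit) swaps timm's create_dataset for a local data.create_dataset, which is what lets dataset: torch/millionaid above resolve. That data module is among the 23 changed files but is not rendered in this view; a hypothetical sketch of the routing it would need, assuming MillionAID is stored ImageFolder-style with the 51 scene classes as subdirectories:

# Hypothetical sketch only: the commit's real data/create_dataset is not
# shown here. Routes the remote-sensing name, defers the rest to timm.
import os
from torchvision.datasets import ImageFolder
from timm.data import create_dataset as timm_create_dataset

def create_dataset(name, root, split="train", **kwargs):
    if name == "torch/millionaid":
        # <root>/<split>/<scene_class>/*.jpg, 51 classes per this config
        return ImageFolder(os.path.join(root, split))
    return timm_create_dataset(name, root=root, split=split, **kwargs)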
13 changes: 13 additions & 0 deletions debug.sh
@@ -10,3 +10,16 @@ CUDA_VISIBLE_DEVICES="1" \
--data-dir datasets/IMNET1k \
--img-size 224 \
--batch-size 100

# OMP_NUM_THREADS=1 \
# CUDA_VISIBLE_DEVICES="1" \
# python -m debugpy --listen localhost:5678 --wait-for-client \
# main.py \
# --config configs/mixformer.yaml \
# --output outputs/classification \
# --experiment exp1_debug \
# --resume outputs/classification/mixformer_tiny_224/exp1/checkpoint-77.pth.tar \
# --model mixformer_tiny \
# --data-dir datasets/IMNET1k \
# --img-size 224 \
# --batch-size 100
6 changes: 6 additions & 0 deletions engine.py
@@ -95,6 +95,9 @@ def _backward(_loss):
mode=args.clip_mode,
)
optimizer.step()

+# for name, param in model.named_parameters():
+# assert torch.isfinite(param).all() == True, f"Param {param} not a number"

if has_no_sync and not need_update:
with model.no_sync():
@@ -104,6 +107,9 @@ def _backward(_loss):
loss = _forward()
_backward(loss)

+# for name, param in model.named_parameters():
+# assert torch.isfinite(param.grad).all() == True, f"optim {param.grad} not a number"

if not args.distributed:
losses_m.update(loss.item() * accum_steps, input.size(0))
update_sample_count += input.size(0)
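The commented-out assertions added in the two hunks above are one-off NaN probes. A sketch of the same check as a reusable helper that can be toggled without editing the loop body (the helper name is illustrative, not part of this commit):

import torch

def assert_finite(model: torch.nn.Module, check_grads: bool = False) -> None:
    # Fail on the first parameter (or gradient) containing NaN/Inf,
    # reporting the tensor's name rather than dumping its values.
    for name, param in model.named_parameters():
        tensor = param.grad if check_grads else param
        if tensor is not None and not torch.isfinite(tensor).all():
            raise RuntimeError(f"non-finite values in {name}")

# after optimizer.step():  assert_finite(model)
# after _backward(loss):   assert_finite(model, check_grads=True)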
11 changes: 8 additions & 3 deletions main.py
@@ -27,7 +27,7 @@
from torch.nn.parallel import DistributedDataParallel as NativeDDP

from timm import utils
-from timm.data import create_dataset, create_loader, resolve_data_config, Mixup, FastCollateMixup, AugMixDataset
+from timm.data import create_loader, resolve_data_config, Mixup, FastCollateMixup, AugMixDataset
from timm.layers import convert_splitbn_model, convert_sync_batchnorm, set_fast_norm
from timm.loss import JsdCrossEntropy, SoftTargetCrossEntropy, BinaryCrossEntropy, LabelSmoothingCrossEntropy
from timm.models import create_model, safe_model_name, resume_checkpoint, load_checkpoint
@@ -37,6 +37,7 @@

from engine import train_one_epoch, validate
from models import *
+from data import create_dataset

import torchstat, torchinfo, torchsummary
from calflops import calculate_flops
@@ -78,7 +79,7 @@

def main():

-# region Initial
+# region Initial
args, args_text = _parse_args()

if torch.cuda.is_available():
@@ -103,7 +104,11 @@ def main():
str(data_config['input_size'][-1])
])


output_dir = utils.get_outdir(args.output if args.output else './output/train', exp_name, inc=not args.override)
if args.override:
shutil.rmtree(output_dir)
os.makedirs(output_dir)
log_dir = os.path.join(output_dir, "train_log.txt")

# modified ------>>>
@@ -325,7 +330,7 @@ def main():
else:
if utils.is_primary(args):
_logger.info("Using native Torch DistributedDataParallel.")
-model = NativeDDP(model, device_ids=[], broadcast_buffers=not args.no_ddp_bb)
+model = NativeDDP(model, device_ids=[device], broadcast_buffers=not args.no_ddp_bb)
# NOTE: EMA model does not need to be wrapped by DDP

if args.torchcompile:
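The device_ids change above follows the standard single-process-per-GPU DDP pattern: each process wraps the model with exactly the device it owns, rather than an empty list. A minimal sketch of the setup that line assumes (requires torch.distributed to be initialized first; names are illustrative):

import torch
from torch.nn.parallel import DistributedDataParallel as NativeDDP

def wrap_ddp(model: torch.nn.Module, local_rank: int,
             broadcast_buffers: bool = True) -> NativeDDP:
    # One process per GPU: move the model to this process's device and
    # pass that same device in device_ids, as the fixed line does.
    device = torch.device("cuda", local_rank)
    return NativeDDP(model.to(device), device_ids=[device],
                     broadcast_buffers=broadcast_buffers)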
