Fine-tuning problem · Issue #457

Open

LLH-Harward opened this issue Aug 3, 2024 · 15 comments

@LLH-Harward

After fine-tuning, the detector fails to detect any objects: the output image has no detected boxes, even though the loss keeps decreasing during training. I want to know whether the issue lies with my method or my data. Could anyone help me? Thank you.
[image attachment]

Inference command used after training:

```bash
python .\image_demo.py D:\YOLO-World-master\configs\pretrain\custom_yolo_world_l_clip.py D:\YOLO-World-master\log_200\epoch_220.pth D:\YOLO-World-master\datasets\images\train\0000001.jpg "Book"
```

metainfo:

```python
metainfo = dict(
    classes=('Chalk', 'Microphone', 'MobilePhone', 'Tablet',
             'OtherTeachingTools', 'Book', 'Pen', 'RulerTools', 'Eraser',
             'PencilCase', 'Laptop', 'NonEducationalItems',
             'BlackboardWriting', 'Notes'))
```

class_text_path (data/texts/custom.json):

```json
[
  ["Chalk"],
  ["Microphone"],
  ["MobilePhone"],
  ["Tablet"],
  ["OtherTeachingTools"],
  ["Book"],
  ["Pen"],
  ["RulerTools"],
  ["Eraser"],
  ["PencilCase"],
  ["Laptop"],
  ["NonEducationalItems"],
  ["BlackboardWriting"],
  ["Notes"]
]
```
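For reference, the texts file is a list of single-item lists, one per class, in the same order as the COCO category ids. A minimal sketch for generating it from the same tuple used in `metainfo` (the helper itself is illustrative, not part of the repo; the output path matches the config below):

```python
import json

# Illustrative helper (not part of YOLO-World): write the class-text JSON
# from the metainfo classes, preserving category order.
classes = ('Chalk', 'Microphone', 'MobilePhone', 'Tablet',
           'OtherTeachingTools', 'Book', 'Pen', 'RulerTools', 'Eraser',
           'PencilCase', 'Laptop', 'NonEducationalItems',
           'BlackboardWriting', 'Notes')

with open('data/texts/custom.json', 'w') as f:
    json.dump([[name] for name in classes], f, indent=2)
```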

Config file (note: GitHub's markdown stripped the `_base_`/`_delete_` underscores and the `*` unpacking in the original paste; restored here):

```python
_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_l_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'],
                      allow_failed_imports=False)

import os
os.chdir('D:/YOLO-World-master')

# hyper-parameters
num_classes = 14
num_training_classes = 14
max_epochs = 500  # maximum training epochs
close_mosaic_epochs = 10
save_epoch_intervals = 5
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 1e-4
weight_decay = 0.05
train_batch_size_per_gpu = 16
load_from = 'pretrained_weights/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth'
persistent_workers = False

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model={{_base_.model.backbone}},
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name='openai/clip-vit-base-patch32',
            frozen_modules=['all'])),
    neck=dict(type='YOLOWorldDualPAFPN',
              guide_channels=text_channels,
              embed_channels=neck_embed_channels,
              num_heads=neck_num_heads,
              block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'),
              text_enhancder=dict(type='ImagePoolingAttentionModule',
                                  embed_channels=256,
                                  num_heads=8)),
    bbox_head=dict(type='YOLOWorldHead',
                   head_module=dict(type='YOLOWorldHeadModule',
                                    embed_dims=text_channels,
                                    num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(type='RandomLoadText',
         num_neg_samples=(num_classes, num_classes),
         max_num_samples=num_training_classes,
         padding_to_max=True,
         padding_value=''),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                    'flip_direction', 'texts'))
]
mosaic_affine_transform = [
    dict(
        type='MultiModalMosaic',
        img_scale=_base_.img_scale,
        pad_val=114.0,
        pre_transform=_base_.pre_transform),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        max_aspect_ratio=100.,
        scaling_ratio_range=(1 - _base_.affine_scale,
                             1 + _base_.affine_scale),
        # img_scale is (width, height)
        border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
        border_val=(114, 114, 114))
]
train_pipeline = [
    *_base_.pre_transform,
    *mosaic_affine_transform,
    dict(
        type='YOLOv5MultiModalMixUp',
        prob=_base_.mixup_prob,
        pre_transform=[
            *_base_.pre_transform,
            *mosaic_affine_transform]),
    *_base_.last_transform[:-1],
    *text_transform
]
train_pipeline_stage2 = [
    *_base_.train_pipeline_stage2[:-1],
    *text_transform
]

metainfo = dict(
    classes=('Chalk', 'Microphone', 'MobilePhone', 'Tablet',
             'OtherTeachingTools', 'Book', 'Pen', 'RulerTools', 'Eraser',
             'PencilCase', 'Laptop', 'NonEducationalItems',
             'BlackboardWriting', 'Notes'))
coco_train_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5CocoDataset',
        metainfo=metainfo,
        data_root='datasets',
        ann_file='annotations/train.json',
        data_prefix=dict(img='images'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/custom.json',
    pipeline=train_pipeline)

train_dataloader = dict(
    persistent_workers=persistent_workers,
    batch_size=train_batch_size_per_gpu,
    collate_fn=dict(type='yolow_collate'),
    dataset=coco_train_dataset)
test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='LoadText'),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor', 'pad_param', 'texts'))
]
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5CocoDataset',
        metainfo=metainfo,
        data_root='datasets/',
        ann_file='annotations/val.json',
        data_prefix=dict(img='images'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/custom.json',
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

# training settings
default_hooks = dict(
    param_scheduler=dict(
        scheduler_type='linear',
        lr_factor=0.01,
        max_epochs=max_epochs),
    checkpoint=dict(
        max_keep_ckpts=-1,
        save_best=None,
        interval=save_epoch_intervals))
custom_hooks = [
    dict(
        type='EMAHook',
        ema_type='ExpMomentumEMA',
        momentum=0.0001,
        update_buffers=True,
        strict_load=False,
        priority=49),
    dict(
        type='mmdet.PipelineSwitchHook',
        switch_epoch=max_epochs - close_mosaic_epochs,
        switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(
    max_epochs=max_epochs,
    val_interval=5,
    dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                        _base_.val_interval_stage2)])
optim_wrapper = dict(
    optimizer=dict(
        _delete_=True,
        type='AdamW',
        lr=base_lr,
        weight_decay=weight_decay,
        batch_size_per_gpu=train_batch_size_per_gpu),
    paramwise_cfg=dict(
        custom_keys={'backbone.text_model': dict(lr_mult=0.01),
                     'logit_scale': dict(weight_decay=0.0)}),
    constructor='YOLOWv5OptimizerConstructor')

# evaluation settings
val_evaluator = dict(
    _delete_=True,
    type='mmdet.CocoMetric',
    proposal_nums=(100, 1, 10),
    ann_file='datasets/annotations/val.json',
    metric='bbox')
```
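For context, a minimal sketch of how a config like this is launched, assuming the standard mmengine workflow that YOLO-World builds on (the config path matches the inference command above; the `work_dir` is the checkpoint directory used there):

```python
from mmengine.config import Config
from mmengine.runner import Runner

# Minimal training-launch sketch under the mmengine toolchain.
cfg = Config.fromfile('configs/pretrain/custom_yolo_world_l_clip.py')
cfg.work_dir = 'log_200'  # where epoch_220.pth above was saved
runner = Runner.from_cfg(cfg)
runner.train()
```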

datasets:

[image attachment]

train.json:

```json
{
  "images": [
    {
      "file_name": "val\\0000002.jpg",
      "id": 0,
      "width": 1920,
      "height": 1080
    },
    ...
  ],
  "annotations": [
    {
      "image_id": 0,
      "id": 0,
      "category_id": 9,
      "bbox": [
        342.47200000000004,
        610.78652,
        95.72999999999996,
        72.80948000000001
      ],
      "area": 6970.051520399998,
      "segmentation": [
        [
          342.47200000000004,
          610.78652,
          438.202,
          610.78652,
          438.202,
          683.596,
          342.47200000000004,
          683.596
        ]
      ],
      "iscrowd": 0
    },
    {
      "image_id": 1,
      "id": 1,
      "category_id": 9,
      "bbox": [
        542.02231,
        690.3370000000001,
        115.95522000000005,
        76.85399999999993
      ],
      "area": 8911.622477879995,
      "segmentation": [
        [
          542.02231,
          690.3370000000001,
          657.97753,
          690.3370000000001,
          657.97753,
          767.191,
          542.02231,
          767.191
        ]
      ],
      "iscrowd": 0
    },
    ...
  ],
  "categories": [
    {
      "id": 0,
      "name": "Chalk"
    },
    {
      "id": 1,
      "name": "Microphone"
    },
    {
      "id": 2,
      "name": "MobilePhone"
    },
    {
      "id": 3,
      "name": "Tablet"
    },
    {
      "id": 4,
      "name": "OtherTeachingTools"
    },
    {
      "id": 5,
      "name": "Book"
    },
    {
      "id": 6,
      "name": "Pen"
    },
    {
      "id": 7,
      "name": "RulerTools"
    },
    {
      "id": 8,
      "name": "Eraser"
    },
    {
      "id": 9,
      "name": "PencilCase"
    },
    {
      "id": 10,
      "name": "Laptop"
    },
    {
      "id": 11,
      "name": "NonEducationalItems"
    },
    {
      "id": 12,
      "name": "BlackboardWriting"
    },
    {
      "id": 13,
      "name": "Notes"
    }
  ]
}
```
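One thing worth double-checking with annotations like these: the metainfo tuple, the texts JSON, and the COCO `categories` should all agree on the same zero-based order, otherwise every label is silently shifted during fine-tuning. A hedged sanity check (paths as in the config; the script itself is illustrative):

```python
import json

# Illustrative check: category ids must be 0..N-1 and line up with the
# texts file, or labels shift silently during training.
with open('datasets/annotations/train.json') as f:
    coco = json.load(f)
with open('data/texts/custom.json') as f:
    texts = json.load(f)

ids = sorted(cat['id'] for cat in coco['categories'])
assert ids == list(range(len(texts))), 'category ids should be 0..N-1'
for cat in coco['categories']:
    assert texts[cat['id']][0] == cat['name'], (
        f"text/category mismatch at id {cat['id']}")
print('categories and texts are aligned')
```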
@LLH-Harward (Author)

@wondervictor

@LLH-Harward (Author)

My mistake; there is no problem after all, as can be worked out from the code. Just set the score threshold in image_demo.py very low.
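For anyone hitting the same thing, the fix is just post-processing: detections exist but sit below the demo's default score threshold. A minimal, self-contained sketch of the filtering step (the tensors are stand-ins for the model's predictions; the 0.005 value is illustrative):

```python
import torch

# Stand-in predictions; in image_demo.py these come from the detector's
# pred_instances (boxes, scores, labels).
scores = torch.tensor([0.04, 0.12, 0.55, 0.003])
boxes = torch.rand(4, 4)

score_thr = 0.005  # much lower than a typical default such as 0.3
keep = scores > score_thr
print(boxes[keep].shape, scores[keep])  # 3 of the 4 boxes survive
```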

@LLH-Harward (Author)

I fine-tuned a model using the x-1280 variant, and it can detect bounding boxes, but the confidence is very low. Do you have any solutions for this?
[image attachment]

@2879982985

Hi, how did you solve the problem of grad-norm being 0?

@LLH-Harward (Author)

If you have any methods for improving the confidence, please let me know as well. Much appreciated.

@2879982985

"I tried the second approach, prompt tuning, following the method in your blog: I froze the neck, the head, and the first two layers of the backbone, and only modified the dataset by adding my own classes to metainfo (as below). I found that grad_norm was 0.0001 for the first two epochs and 0.0000 afterwards, and the loss does not decrease." I came here from the comments section because I ran into the same problem. With the second approach (prompt tuning), how did you fix the case where the loss looks normal but grad-norm is 0?
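A zero grad-norm with a normal-looking loss usually means the parameters you intended to train are in fact frozen. A quick, self-contained way to check which tensors still require gradients; `model` here is a toy stand-in for the detector built from the config:

```python
import torch.nn as nn

# Toy stand-in: freeze part of a model the way prompt tuning freezes the
# backbone/neck/head, then list what is still trainable.
model = nn.Sequential(nn.Linear(4, 8), nn.Linear(8, 2))
for p in model[0].parameters():  # the "frozen" block
    p.requires_grad_(False)

trainable = [name for name, p in model.named_parameters() if p.requires_grad]
print(len(trainable), 'trainable tensors:', trainable)
# If this list is empty for the real detector, grad-norm will log as 0.
```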

@LLH-Harward (Author)

LLH-Harward commented Sep 13, 2024 via email

@2879982985

OK, thank you. I'll keep digging into it.

@LLH-Harward (Author)

I'd suggest repa tuning or the Ultralytics version.

@2879982985

Could you help take a look at why my repa tuning also hits this bug: grad-norm is 0 and the loss does not decrease. The second fine-tuning approach behaves the same way.

@LLH-Harward (Author)

LLH-Harward commented Sep 13, 2024 via email

@2879982985

```python
_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)

# hyper-parameters
num_classes = 8
num_training_classes = 8
max_epochs = 300  # maximum training epochs
close_mosaic_epochs = 10
save_epoch_intervals = 5
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-4
weight_decay = 0.05
train_batch_size_per_gpu = 4
load_from = 'weights/yolo_world_v2_s_obj365v1_goldg_pretrain-55b943ea_rep_conv.pth'
persistent_workers = False
mixup_prob = 0.15
copypaste_prob = 0.3
classes = [["rottenSurface"], ["crease"], ["growthLines"], ["healingInjury"],
           ["cavity"], ["bacterialInjury"], ["pinhole"], ["scratch"]]

# model settings
model = dict(type='SimpleYOLOWorldDetector',
             mm_neck=True,
             num_train_classes=num_classes,
             num_test_classes=num_classes,
             reparameterized=True,
             data_preprocessor=dict(type='YOLOv5DetDataPreprocessor'),
             backbone=dict(_delete_=True,
                           type='MultiModalYOLOBackbone',
                           text_model=None,
                           image_model={{_base_.model.backbone}},
                           with_text_model=False),
             neck=dict(type='YOLOWorldPAFPN',
                       guide_channels=num_classes,
                       embed_channels=neck_embed_channels,
                       num_heads=neck_num_heads,
                       block_cfg=dict(
                           type='RepConvMaxSigmoidCSPLayerWithTwoConv',
                           guide_channels=num_classes)),
             bbox_head=dict(head_module=dict(type='RepYOLOWorldHeadModule',
                                             embed_dims=text_channels,
                                             num_guide=num_classes,
                                             num_classes=num_classes)),
             train_cfg=dict(assigner=dict(num_classes=num_classes)))

# dataset settings
final_transform = [
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                    'flip_direction'))
]
mosaic_affine_transform = [
    dict(type='Mosaic',
         img_scale=_base_.img_scale,
         pad_val=114.0,
         pre_transform=_base_.pre_transform),
    dict(type='YOLOv5CopyPaste', prob=copypaste_prob),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        max_aspect_ratio=100.,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        # img_scale is (width, height)
        border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
        border_val=(114, 114, 114),
        min_area_ratio=_base_.min_area_ratio,
        use_mask_refine=_base_.use_mask2refine)
]
train_pipeline = [
    *_base_.pre_transform, *mosaic_affine_transform,
    dict(type='YOLOv5MixUp',
         prob=mixup_prob,
         pre_transform=[
             *_base_.pre_transform, *mosaic_affine_transform]),
    *_base_.last_transform[:-1], *final_transform
]

train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *final_transform]

coco_train_dataset = dict(type='YOLOv5CocoDataset',
                          metainfo=dict(classes=classes),
                          data_root='leather2017/',
                          ann_file='train2017/instances_train2017.json',
                          data_prefix=dict(img='train2017/images/'),
                          filter_cfg=dict(filter_empty_gt=False, min_size=32),
                          pipeline=train_pipeline)

train_dataloader = dict(persistent_workers=persistent_workers,
                        batch_size=train_batch_size_per_gpu,
                        collate_fn=dict(type='yolow_collate'),
                        dataset=coco_train_dataset)
test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                    'scale_factor', 'pad_param'))
]
coco_val_dataset = dict(type='YOLOv5CocoDataset',
                        metainfo=dict(classes=classes),
                        data_root='leather2017/',
                        ann_file='val2017/instances_val2017.json',
                        data_prefix=dict(img='val2017/images/'),
                        filter_cfg=dict(filter_empty_gt=False, min_size=32),
                        pipeline=test_pipeline)

val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

# training settings
default_hooks = dict(param_scheduler=dict(scheduler_type='linear',
                                          lr_factor=0.01,
                                          max_epochs=max_epochs),
                     checkpoint=dict(max_keep_ckpts=-1,
                                     save_best=None,
                                     interval=save_epoch_intervals))
custom_hooks = [
    dict(type='EMAHook',
         ema_type='ExpMomentumEMA',
         momentum=0.0001,
         update_buffers=True,
         strict_load=False,
         priority=49),
    dict(type='mmdet.PipelineSwitchHook',
         switch_epoch=max_epochs - close_mosaic_epochs,
         switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(max_epochs=max_epochs,
                 val_interval=5,
                 dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                                     _base_.val_interval_stage2)])
optim_wrapper = dict(optimizer=dict(
    _delete_=True,
    type='AdamW',
    lr=base_lr,
    weight_decay=weight_decay,
    batch_size_per_gpu=train_batch_size_per_gpu),
                     constructor='YOLOWv5OptimizerConstructor')

# evaluation settings
val_evaluator = dict(_delete_=True,
                     type='mmdet.CocoMetric',
                     proposal_nums=(100, 1, 10),
                     ann_file='leather2017/val2017/instances_val2017.json',
                     metric='bbox')
```
This is the config file (restored with `_base_`/`_delete_` underscores, `*` unpacking, and the duplicated train_dataloader removed); could you please take a look?

@2879982985

Or rather, it was generated directly with that class_text.json file, whose content is [[ "rottenSurface" ], [ "crease" ], [ "growthLines" ], [ "healingInjury" ], [ "cavity" ], [ "bacterialInjury" ], [ "pinhole" ], [ "scratch" ]]. Could a problem with the file format be causing the generated .npy file to be wrong?
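The file format itself (a JSON list of single-string lists) looks right. If the question is how the .npy of class embeddings gets produced from it, here is a hedged sketch using the same HuggingFace CLIP model the configs reference; it mirrors, but is not, the repo's own export tool, and the file names are illustrative:

```python
import json

import numpy as np
import torch
from transformers import CLIPModel, CLIPTokenizer

# Illustrative re-creation of the class-embedding .npy from class_text.json;
# the model name matches the HuggingCLIPLanguageBackbone used above.
name = 'openai/clip-vit-base-patch32'
tokenizer = CLIPTokenizer.from_pretrained(name)
model = CLIPModel.from_pretrained(name).eval()

with open('class_text.json') as f:
    prompts = [entry[0] for entry in json.load(f)]

with torch.no_grad():
    tokens = tokenizer(prompts, padding=True, return_tensors='pt')
    feats = model.get_text_features(**tokens)
    feats = feats / feats.norm(dim=-1, keepdim=True)  # L2-normalize

np.save('class_embeddings.npy', feats.numpy())  # shape: (8, 512)
```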

@nightwishhhhh

> If you have any methods for improving the confidence, please let me know as well. Much appreciated.

Hi, has this problem been solved? Thanks.
