Skip to content

Commit e5f9f35

Browse files
authored
Update README and refine of MM-GDINO (#11298)
1 parent 63a4bb8 commit e5f9f35

File tree

51 files changed

+2956
-341
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

51 files changed

+2956
-341
lines changed

configs/glip/README.md

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,8 @@ Learning visual representations from natural language supervision has recently s
166166

167167
### Results on Flickr30k
168168

169-
| Model | Official | Pre-Train Data | Val R@1 | Val R@5 | Val R@10 | Test R@1 | Test R@5 | Test R@10 |
170-
| ------------- | -------- | -------------- | ------- | ------- | -------- | -------- | -------- | --------- |
171-
| **GLIP-T(C)** | ✔ | O365, GoldG | 84.8 | 94.9 | 96.3 | 85.5 | 95.4 | 96.6 |
172-
| **GLIP-T(C)** | | O365, GoldG | 84.9 | 94.9 | 96.3 | 85.6 | 95.4 | 96.7 |
169+
| Model | Official | Pre-Train Data | Val R@1 | Val R@5 | Val R@10 | Test R@1 | Test R@5 | Test R@10 |
170+
| ------------- | -------- | ------------------- | ------- | ------- | -------- | -------- | -------- | --------- |
171+
| **GLIP-T(C)** | ✔ | O365, GoldG | 84.8 | 94.9 | 96.3 | 85.5 | 95.4 | 96.6 |
172+
| **GLIP-T(C)** | | O365, GoldG | 84.9 | 94.9 | 96.3 | 85.6 | 95.4 | 96.7 |
173+
| **GLIP-T** | | O365,GoldG,CC3M,SBU | 85.3 | 95.5 | 96.9 | 86.0 | 95.9 | 97.2 |

configs/glip/flickr30k/glip_atss_swin-t_c_fpn_dyhead_pretrain_obj365-goldg_zeroshot_flickr30k.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@
22

33
lang_model_name = 'bert-base-uncased'
44

5-
model = dict(bbox_head=dict(early_fuse=True), )
5+
model = dict(bbox_head=dict(early_fuse=True))
66

77
dataset_type = 'Flickr30kDataset'
8-
data_root = 'data/flickr30k/'
8+
data_root = 'data/flickr30k_entities/'
99

1010
test_pipeline = [
1111
dict(
@@ -27,15 +27,15 @@
2727
dataset_Flickr30k_val = dict(
2828
type=dataset_type,
2929
data_root=data_root,
30-
ann_file='mdetr_annotations/final_flickr_separateGT_val.json',
30+
ann_file='final_flickr_separateGT_val.json',
3131
data_prefix=dict(img='flickr30k_images/'),
3232
pipeline=test_pipeline,
3333
)
3434

3535
dataset_Flickr30k_test = dict(
3636
type=dataset_type,
3737
data_root=data_root,
38-
ann_file='mdetr_annotations/final_flickr_separateGT_test.json',
38+
ann_file='final_flickr_separateGT_test.json',
3939
data_prefix=dict(img='flickr30k_images/'),
4040
pipeline=test_pipeline,
4141
)

configs/grounding_dino/flickr30k/grounding_dino_swin-t-pretrain_zeroshot_flickr30k.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
_base_ = '../grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py'
22

33
dataset_type = 'Flickr30kDataset'
4-
data_root = 'data/flickr30k/'
4+
data_root = 'data/flickr30k_entities/'
55

66
test_pipeline = [
77
dict(
@@ -23,15 +23,15 @@
2323
dataset_Flickr30k_val = dict(
2424
type=dataset_type,
2525
data_root=data_root,
26-
ann_file='mdetr_annotations/final_flickr_separateGT_val.json',
26+
ann_file='final_flickr_separateGT_val.json',
2727
data_prefix=dict(img='flickr30k_images/'),
2828
pipeline=test_pipeline,
2929
)
3030

3131
dataset_Flickr30k_test = dict(
3232
type=dataset_type,
3333
data_root=data_root,
34-
ann_file='mdetr_annotations/final_flickr_separateGT_test.json',
34+
ann_file='final_flickr_separateGT_test.json',
3535
data_prefix=dict(img='flickr30k_images/'),
3636
pipeline=test_pipeline,
3737
)

configs/mm_grounding_dino/README.md

Lines changed: 319 additions & 113 deletions
Large diffs are not rendered by default.

configs/mm_grounding_dino/brain_tumor/grounding_dino_swin-t_finetune_8xb4_50e_brain_tumor.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
# https://universe.roboflow.com/roboflow-100/brain-tumor-m2pbp/dataset/2
44
data_root = 'data/brain_tumor_v2/'
55
class_name = ('label0', 'label1', 'label2')
6+
label_name = '_annotations.coco.json'
7+
68
palette = [(220, 20, 60), (255, 0, 0), (0, 0, 142)]
79

810
metainfo = dict(classes=class_name, palette=palette)
@@ -64,20 +66,20 @@
6466
pipeline=train_pipeline,
6567
return_classes=True,
6668
data_prefix=dict(img='train/'),
67-
ann_file='train/_annotations.coco.json')))
69+
ann_file='train/' + label_name)))
6870

6971
val_dataloader = dict(
7072
dataset=dict(
7173
metainfo=metainfo,
7274
data_root=data_root,
7375
return_classes=True,
74-
ann_file='valid/_annotations.coco.json',
76+
ann_file='valid/' + label_name,
7577
data_prefix=dict(img='valid/')))
7678
test_dataloader = val_dataloader
7779

7880
val_evaluator = dict(
7981
type='CocoMetric',
80-
ann_file=data_root + 'valid/_annotations.coco.json',
82+
ann_file=data_root + 'valid/' + label_name,
8183
metric='bbox',
8284
format_only=False)
8385
test_evaluator = val_evaluator
@@ -107,4 +109,4 @@
107109

108110
default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto'))
109111

110-
load_from = ''
112+
load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth' # noqa

configs/mm_grounding_dino/cityscapes/grounding_dino_swin-t_finetune_8xb4_50e_cityscapes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,4 +107,4 @@
107107
train_cfg = dict(max_epochs=max_epochs, val_interval=1)
108108
default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto'))
109109

110-
load_from = ''
110+
load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth' # noqa

configs/mm_grounding_dino/coco/grounding_dino_swin-t_finetune_16xb4_1x_coco.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@
6464
custom_keys={
6565
'absolute_pos_embed': dict(decay_mult=0.),
6666
'backbone': dict(lr_mult=0.1),
67-
# 'language_model': dict(lr_mult=0),
67+
'language_model': dict(lr_mult=0.1),
6868
}))
6969

7070
# learning policy
@@ -75,11 +75,11 @@
7575
begin=0,
7676
end=max_epochs,
7777
by_epoch=True,
78-
milestones=[11],
78+
milestones=[8, 11],
7979
gamma=0.1)
8080
]
8181
train_cfg = dict(max_epochs=max_epochs, val_interval=1)
8282

8383
default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto'))
8484

85-
load_from = ''
85+
load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth' # noqa

configs/mm_grounding_dino/coco/grounding_dino_swin-t_finetune_16xb4_1x_coco_48_17.py

Lines changed: 16 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -8,21 +8,20 @@
88
'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
99
'pizza', 'donut', 'chair', 'bed', 'toilet', 'tv', 'laptop',
1010
'mouse', 'remote', 'microwave', 'oven', 'toaster',
11-
'refrigerator', 'book', 'clock', 'vase', 'toothbrush')
11+
'refrigerator', 'book', 'clock', 'vase', 'toothbrush') # 48
1212
novel_classes = ('airplane', 'bus', 'cat', 'dog', 'cow', 'elephant',
1313
'umbrella', 'tie', 'snowboard', 'skateboard', 'cup', 'knife',
14-
'cake', 'couch', 'keyboard', 'sink', 'scissors')
15-
all_classes = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
16-
'train', 'truck', 'boat', 'bench', 'bird', 'cat', 'dog',
17-
'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
18-
'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
19-
'skis', 'snowboard', 'kite', 'skateboard', 'surfboard',
20-
'bottle', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana',
21-
'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'pizza',
22-
'donut', 'cake', 'chair', 'couch', 'bed', 'toilet', 'tv',
23-
'laptop', 'mouse', 'remote', 'keyboard', 'microwave', 'oven',
24-
'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
25-
'scissors', 'toothbrush')
14+
'cake', 'couch', 'keyboard', 'sink', 'scissors') # 17
15+
all_classes = (
16+
'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
17+
'truck', 'boat', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
18+
'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
19+
'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'kite', 'skateboard',
20+
'surfboard', 'bottle', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana',
21+
'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'pizza', 'donut',
22+
'cake', 'chair', 'couch', 'bed', 'toilet', 'tv', 'laptop', 'mouse',
23+
'remote', 'keyboard', 'microwave', 'oven', 'toaster', 'sink',
24+
'refrigerator', 'book', 'clock', 'vase', 'scissors', 'toothbrush') # 65
2625

2726
train_metainfo = dict(classes=base_classes)
2827
test_metainfo = dict(
@@ -95,7 +94,7 @@
9594
type='CocoDataset',
9695
metainfo=train_metainfo,
9796
data_root=data_root,
98-
ann_file='zero-shot/instances_train2017_seen_2.json',
97+
ann_file='annotations/instances_train2017_seen_2.json',
9998
data_prefix=dict(img='train2017/'),
10099
return_classes=True,
101100
filter_cfg=dict(filter_empty_gt=False, min_size=32),
@@ -111,7 +110,7 @@
111110
type='CocoDataset',
112111
metainfo=test_metainfo,
113112
data_root=data_root,
114-
ann_file='zero-shot/instances_val2017_all_2.json',
113+
ann_file='annotations/instances_val2017_all_2.json',
115114
data_prefix=dict(img='val2017/'),
116115
test_mode=True,
117116
pipeline=test_pipeline,
@@ -121,7 +120,7 @@
121120

122121
val_evaluator = dict(
123122
type='OVCocoMetric',
124-
ann_file=data_root + 'zero-shot/instances_val2017_all_2.json',
123+
ann_file=data_root + 'annotations/instances_val2017_all_2.json',
125124
metric='bbox',
126125
format_only=False)
127126
test_evaluator = val_evaluator
@@ -155,4 +154,4 @@
155154
checkpoint=dict(
156155
max_keep_ckpts=1, save_best='coco/novel_ap50', rule='greater'))
157156

158-
load_from = 'epoch_30.pth'
157+
load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth' # noqa
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
_base_ = '../grounding_dino_swin-t_pretrain_obj365.py'
2+
3+
data_root = 'data/coco/'
4+
5+
train_pipeline = [
6+
dict(type='LoadImageFromFile'),
7+
dict(type='LoadAnnotations', with_bbox=True),
8+
dict(type='RandomFlip', prob=0.5),
9+
dict(
10+
type='RandomChoice',
11+
transforms=[
12+
[
13+
dict(
14+
type='RandomChoiceResize',
15+
scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
16+
(608, 1333), (640, 1333), (672, 1333), (704, 1333),
17+
(736, 1333), (768, 1333), (800, 1333)],
18+
keep_ratio=True)
19+
],
20+
[
21+
dict(
22+
type='RandomChoiceResize',
23+
# The aspect ratio of every image in the train dataset is < 7
24+
# (this follows the original implementation)
25+
scales=[(400, 4200), (500, 4200), (600, 4200)],
26+
keep_ratio=True),
27+
dict(
28+
type='RandomCrop',
29+
crop_type='absolute_range',
30+
crop_size=(384, 600),
31+
allow_negative_crop=True),
32+
dict(
33+
type='RandomChoiceResize',
34+
scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
35+
(608, 1333), (640, 1333), (672, 1333), (704, 1333),
36+
(736, 1333), (768, 1333), (800, 1333)],
37+
keep_ratio=True)
38+
]
39+
]),
40+
dict(
41+
type='RandomSamplingNegPos',
42+
tokenizer_name=_base_.lang_model_name,
43+
num_sample_negative=20, # ======= important =====
44+
label_map_file='data/coco/annotations/coco2017_label_map.json',
45+
max_tokens=256),
46+
dict(
47+
type='PackDetInputs',
48+
meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
49+
'scale_factor', 'flip', 'flip_direction', 'text',
50+
'custom_entities', 'tokens_positive', 'dataset_mode'))
51+
]
52+
53+
train_dataloader = dict(
54+
dataset=dict(
55+
_delete_=True,
56+
type='ODVGDataset',
57+
need_text=False,
58+
data_root=data_root,
59+
ann_file='annotations/instances_train2017_od.json',
60+
label_map_file='annotations/coco2017_label_map.json',
61+
data_prefix=dict(img='train2017/'),
62+
return_classes=True,
63+
filter_cfg=dict(filter_empty_gt=False, min_size=32),
64+
pipeline=train_pipeline))
65+
66+
optim_wrapper = dict(
67+
_delete_=True,
68+
type='OptimWrapper',
69+
optimizer=dict(type='AdamW', lr=0.00005, weight_decay=0.0001),
70+
clip_grad=dict(max_norm=0.1, norm_type=2),
71+
paramwise_cfg=dict(
72+
custom_keys={
73+
'absolute_pos_embed': dict(decay_mult=0.),
74+
'backbone': dict(lr_mult=0.1),
75+
'language_model': dict(lr_mult=0.0),
76+
}))
77+
78+
# learning policy
79+
max_epochs = 12
80+
param_scheduler = [
81+
dict(
82+
type='MultiStepLR',
83+
begin=0,
84+
end=max_epochs,
85+
by_epoch=True,
86+
milestones=[8, 11],
87+
gamma=0.1)
88+
]
89+
train_cfg = dict(max_epochs=max_epochs, val_interval=1)
90+
91+
default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto'))
92+
93+
load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth' # noqa

0 commit comments

Comments
 (0)