Skip to content

Commit d6aa5da

Browse files
authored
[Feature] Support Multi-View 3D Visual Grounding Benchmark and Baselines (#10)
* Support visual grounding benchmark and baseline models * Update the info path for the CVPR 2024 challenge
1 parent 8b95e69 commit d6aa5da

39 files changed

+4100
-30
lines changed

.dev_scripts/diff_coverage_test.sh

100755100644
File mode changed.

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ data/scannet
126126
data/3rscan
127127
data/matterport3d
128128
data/*.pkl
129+
data/*.json
129130
exps/
130131
todo.md
131132

configs/detection/embodied-det3d_8xb1_embodiedscan-3d-284class-9dof-mlvl.py configs/detection/cont-det3d_8xb1_embodiedscan-3d-284class-9dof.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@
190190
times=8,
191191
dataset=dict(type=dataset_type,
192192
data_root=data_root,
193-
ann_file='embodiedscan_infos_train_full.pkl',
193+
ann_file='embodiedscan_infos_train.pkl',
194194
pipeline=train_pipeline,
195195
test_mode=False,
196196
filter_empty_gt=True,
@@ -205,7 +205,7 @@
205205
sampler=dict(type='DefaultSampler', shuffle=False),
206206
dataset=dict(type=dataset_type,
207207
data_root=data_root,
208-
ann_file='embodiedscan_infos_val_full.pkl',
208+
ann_file='embodiedscan_infos_val.pkl',
209209
pipeline=test_pipeline,
210210
test_mode=True,
211211
filter_empty_gt=True,
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@
186186
times=10,
187187
dataset=dict(type=dataset_type,
188188
data_root=data_root,
189-
ann_file='embodiedscan_infos_train_full.pkl',
189+
ann_file='embodiedscan_infos_train.pkl',
190190
pipeline=train_pipeline,
191191
test_mode=False,
192192
filter_empty_gt=True,
@@ -200,7 +200,7 @@
200200
sampler=dict(type='DefaultSampler', shuffle=False),
201201
dataset=dict(type=dataset_type,
202202
data_root=data_root,
203-
ann_file='embodiedscan_infos_val_full.pkl',
203+
ann_file='embodiedscan_infos_val.pkl',
204204
pipeline=test_pipeline,
205205
test_mode=True,
206206
filter_empty_gt=True,
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
_base_ = ['../default_runtime.py']
2+
n_points = 100000
3+
4+
backend_args = None
5+
# Uncomment the following if you use Ceph or another file client.
6+
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
7+
# for more details.
8+
# backend_args = dict(
9+
# backend='petrel',
10+
# path_mapping=dict({
11+
# './data/scannet/':
12+
# 's3://openmmlab/datasets/detection3d/scannet_processed/',
13+
# 'data/scannet/':
14+
# 's3://openmmlab/datasets/detection3d/scannet_processed/'
15+
# }))
16+
17+
metainfo = dict(classes='all')
18+
19+
model = dict(
20+
type='SparseFeatureFusion3DGrounder',
21+
num_queries=256,
22+
voxel_size=0.01,
23+
data_preprocessor=dict(type='Det3DDataPreprocessor',
24+
mean=[123.675, 116.28, 103.53],
25+
std=[58.395, 57.12, 57.375],
26+
bgr_to_rgb=True,
27+
pad_size_divisor=32),
28+
backbone=dict(
29+
type='mmdet.ResNet',
30+
depth=50,
31+
base_channels=16, # to make it consistent with mink resnet
32+
num_stages=4,
33+
out_indices=(0, 1, 2, 3),
34+
frozen_stages=1,
35+
norm_cfg=dict(type='BN', requires_grad=False),
36+
norm_eval=True,
37+
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
38+
style='pytorch'),
39+
backbone_lidar=dict(type='MinkResNet', in_channels=3, depth=34),
40+
use_xyz_feat=True,
41+
# changed because there is no image feature fusion
42+
neck_3d=dict(type='MinkNeck',
43+
num_classes=1,
44+
in_channels=[128, 256, 512, 1024],
45+
out_channels=256,
46+
voxel_size=0.01,
47+
pts_prune_threshold=1000),
48+
decoder=dict(
49+
num_layers=6,
50+
return_intermediate=True,
51+
layer_cfg=dict(
52+
# query self attention layer
53+
self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
54+
# cross attention layer query to text
55+
cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
56+
# cross attention layer query to image
57+
cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
58+
ffn_cfg=dict(embed_dims=256,
59+
feedforward_channels=2048,
60+
ffn_drop=0.0)),
61+
post_norm_cfg=None),
62+
bbox_head=dict(type='GroundingHead',
63+
num_classes=256,
64+
sync_cls_avg_factor=True,
65+
decouple_bbox_loss=True,
66+
decouple_groups=4,
67+
share_pred_layer=True,
68+
decouple_weights=[0.2, 0.2, 0.2, 0.4],
69+
contrastive_cfg=dict(max_text_len=256,
70+
log_scale='auto',
71+
bias=True),
72+
loss_cls=dict(type='mmdet.FocalLoss',
73+
use_sigmoid=True,
74+
gamma=2.0,
75+
alpha=0.25,
76+
loss_weight=1.0),
77+
loss_bbox=dict(type='BBoxCDLoss',
78+
mode='l1',
79+
loss_weight=1.0,
80+
group='g8')),
81+
coord_type='DEPTH',
82+
# training and testing settings
83+
train_cfg=dict(assigner=dict(type='HungarianAssigner3D',
84+
match_costs=[
85+
dict(type='BinaryFocalLossCost',
86+
weight=1.0),
87+
dict(type='BBox3DL1Cost', weight=2.0),
88+
dict(type='IoU3DCost', weight=2.0)
89+
]), ),
90+
test_cfg=None)
91+
92+
dataset_type = 'MultiView3DGroundingDataset'
93+
data_root = 'data'
94+
95+
train_pipeline = [
96+
dict(type='LoadAnnotations3D'),
97+
dict(type='MultiViewPipeline',
98+
n_images=20,
99+
transforms=[
100+
dict(type='LoadImageFromFile', backend_args=backend_args),
101+
dict(type='LoadDepthFromFile', backend_args=backend_args),
102+
dict(type='ConvertRGBDToPoints', coord_type='CAMERA'),
103+
dict(type='PointSample', num_points=n_points // 10),
104+
dict(type='Resize', scale=(480, 480), keep_ratio=False)
105+
]),
106+
dict(type='AggregateMultiViewPoints', coord_type='DEPTH'),
107+
dict(type='PointSample', num_points=n_points),
108+
dict(type='GlobalRotScaleTrans',
109+
rot_range=[-0.087266, 0.087266],
110+
scale_ratio_range=[.9, 1.1],
111+
translation_std=[.1, .1, .1],
112+
shift_height=False),
113+
dict(type='Pack3DDetInputs',
114+
keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d'])
115+
]
116+
test_pipeline = [
117+
dict(type='LoadAnnotations3D'),
118+
dict(type='MultiViewPipeline',
119+
n_images=50,
120+
ordered=True,
121+
transforms=[
122+
dict(type='LoadImageFromFile', backend_args=backend_args),
123+
dict(type='LoadDepthFromFile', backend_args=backend_args),
124+
dict(type='ConvertRGBDToPoints', coord_type='CAMERA'),
125+
dict(type='PointSample', num_points=n_points // 10),
126+
dict(type='Resize', scale=(480, 480), keep_ratio=False)
127+
]),
128+
dict(type='AggregateMultiViewPoints', coord_type='DEPTH'),
129+
dict(type='PointSample', num_points=n_points),
130+
dict(type='Pack3DDetInputs',
131+
keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d'])
132+
]
133+
134+
# TODO: determine a reasonable batch size
135+
train_dataloader = dict(
136+
batch_size=12,
137+
num_workers=12,
138+
persistent_workers=True,
139+
sampler=dict(type='DefaultSampler', shuffle=True),
140+
dataset=dict(type='RepeatDataset',
141+
times=1,
142+
dataset=dict(type=dataset_type,
143+
data_root=data_root,
144+
ann_file='embodiedscan_infos_train.pkl',
145+
vg_file='embodiedscan_train_full_vg.json',
146+
metainfo=metainfo,
147+
pipeline=train_pipeline,
148+
test_mode=False,
149+
filter_empty_gt=True,
150+
box_type_3d='Euler-Depth')))
151+
152+
val_dataloader = dict(batch_size=12,
153+
num_workers=12,
154+
persistent_workers=True,
155+
drop_last=False,
156+
sampler=dict(type='DefaultSampler', shuffle=False),
157+
dataset=dict(type=dataset_type,
158+
data_root=data_root,
159+
ann_file='embodiedscan_infos_val.pkl',
160+
vg_file='embodiedscan_val_full_vg.json',
161+
metainfo=metainfo,
162+
pipeline=test_pipeline,
163+
test_mode=True,
164+
filter_empty_gt=True,
165+
box_type_3d='Euler-Depth'))
166+
test_dataloader = val_dataloader
167+
168+
val_evaluator = dict(type='GroundingMetric')
169+
test_evaluator = val_evaluator
170+
171+
# training schedule for 1x
172+
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3)
173+
val_cfg = dict(type='ValLoop')
174+
test_cfg = dict(type='TestLoop')
175+
176+
# optimizer
177+
lr = 5e-4
178+
optim_wrapper = dict(type='OptimWrapper',
179+
optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005),
180+
paramwise_cfg=dict(
181+
custom_keys={
182+
'text_encoder': dict(lr_mult=0.0),
183+
'decoder': dict(lr_mult=0.1, decay_mult=1.0)
184+
}),
185+
clip_grad=dict(max_norm=10, norm_type=2))
186+
187+
# learning rate
188+
param_scheduler = dict(type='MultiStepLR',
189+
begin=0,
190+
end=12,
191+
by_epoch=True,
192+
milestones=[8, 11],
193+
gamma=0.1)
194+
195+
custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)]
196+
197+
# hooks
198+
default_hooks = dict(
199+
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
200+
201+
# vis_backends = [
202+
# dict(type='TensorboardVisBackend'),
203+
# dict(type='LocalVisBackend')
204+
# ]
205+
# visualizer = dict(
206+
# type='Det3DLocalVisualizer',
207+
# vis_backends=vis_backends, name='visualizer')
208+
209+
find_unused_parameters = True
210+
load_from = '/mnt/petrelfs/wangtai/EmbodiedScan/work_dirs/mv-3ddet-challenge/epoch_12.pth' # noqa

0 commit comments

Comments
 (0)