From d51c772fc0412ebf05e4a8477638cc017fece733 Mon Sep 17 00:00:00 2001 From: Tai-Wang Date: Tue, 5 Mar 2024 15:44:22 +0800 Subject: [PATCH 1/2] Support visual grounding benchmark and baseline models --- .dev_scripts/diff_coverage_test.sh | 0 .gitignore | 1 + ...t3d_8xb1_embodiedscan-3d-284class-9dof.py} | 4 +- ...t3d_8xb4_embodiedscan-3d-284class-9dof.py} | 34 +- ...ounding_8xb12_embodiedscan-vg-9dof-full.py | 212 +++++ ...mv-grounding_8xb12_embodiedscan-vg-9dof.py | 212 +++++ ...g_8xb12_embodiedscan-vg-9dof_fcaf-coder.py | 213 +++++ embodiedscan/datasets/__init__.py | 5 +- ...ied_dataset.py => embodiedscan_dataset.py} | 0 embodiedscan/datasets/mv_3dvg_dataset.py | 509 +++++++++++ .../datasets/transforms/formatting.py | 3 +- embodiedscan/datasets/transforms/multiview.py | 6 +- embodiedscan/eval/__init__.py | 5 +- embodiedscan/eval/metrics/__init__.py | 5 + embodiedscan/eval/{ => metrics}/det_metric.py | 2 +- embodiedscan/eval/metrics/grounding_metric.py | 159 ++++ .../eval/{ => metrics}/occupancy_metric.py | 0 embodiedscan/models/__init__.py | 2 + embodiedscan/models/dense_heads/__init__.py | 3 +- .../models/dense_heads/grounding_head.py | 855 ++++++++++++++++++ embodiedscan/models/detectors/__init__.py | 6 +- .../detectors/sparse_featfusion_grounder.py | 805 +++++++++++++++++ .../sparse_featfusion_single_stage.py | 6 +- embodiedscan/models/layers/__init__.py | 3 + .../layers/ground_transformer/__init__.py | 7 + .../layers/ground_transformer/decoder.py | 297 ++++++ embodiedscan/models/losses/__init__.py | 8 +- embodiedscan/models/losses/match_cost.py | 265 ++++++ embodiedscan/models/necks/__init__.py | 4 + embodiedscan/models/necks/channel_mapper.py | 90 ++ embodiedscan/models/necks/mink_neck.py | 254 ++++++ embodiedscan/models/task_modules/__init__.py | 4 + .../models/task_modules/assigners/__init__.py | 4 + .../assigners/hungarian_assigner.py | 138 +++ embodiedscan/structures/__init__.py | 8 +- embodiedscan/structures/bbox_3d/__init__.py | 10 +- 
embodiedscan/utils/__init__.py | 7 +- embodiedscan/utils/typing_config.py | 18 +- tools/test.py | 2 +- 39 files changed, 4122 insertions(+), 44 deletions(-) mode change 100755 => 100644 .dev_scripts/diff_coverage_test.sh rename configs/detection/{embodied-det3d_8xb1_embodiedscan-3d-284class-9dof-mlvl.py => cont-det3d_8xb1_embodiedscan-3d-284class-9dof.py} (99%) rename configs/detection/{mv-sparse-featfusion_8xb4_embodiedscan-3d-284class-9dof-mlvl-distorted-corners-group4.py => mv-det3d_8xb4_embodiedscan-3d-284class-9dof.py} (91%) create mode 100644 configs/grounding/mv-grounding_8xb12_embodiedscan-vg-9dof-full.py create mode 100644 configs/grounding/mv-grounding_8xb12_embodiedscan-vg-9dof.py create mode 100644 configs/grounding/mv-grounding_8xb12_embodiedscan-vg-9dof_fcaf-coder.py rename embodiedscan/datasets/{embodied_dataset.py => embodiedscan_dataset.py} (100%) create mode 100644 embodiedscan/datasets/mv_3dvg_dataset.py create mode 100644 embodiedscan/eval/metrics/__init__.py rename embodiedscan/eval/{ => metrics}/det_metric.py (99%) create mode 100644 embodiedscan/eval/metrics/grounding_metric.py rename embodiedscan/eval/{ => metrics}/occupancy_metric.py (100%) create mode 100644 embodiedscan/models/dense_heads/grounding_head.py create mode 100644 embodiedscan/models/detectors/sparse_featfusion_grounder.py create mode 100644 embodiedscan/models/layers/ground_transformer/__init__.py create mode 100644 embodiedscan/models/layers/ground_transformer/decoder.py create mode 100644 embodiedscan/models/losses/match_cost.py create mode 100644 embodiedscan/models/necks/__init__.py create mode 100644 embodiedscan/models/necks/channel_mapper.py create mode 100644 embodiedscan/models/necks/mink_neck.py create mode 100644 embodiedscan/models/task_modules/__init__.py create mode 100644 embodiedscan/models/task_modules/assigners/__init__.py create mode 100644 embodiedscan/models/task_modules/assigners/hungarian_assigner.py diff --git a/.dev_scripts/diff_coverage_test.sh 
b/.dev_scripts/diff_coverage_test.sh old mode 100755 new mode 100644 diff --git a/.gitignore b/.gitignore index 956d4ec..860eafc 100644 --- a/.gitignore +++ b/.gitignore @@ -126,6 +126,7 @@ data/scannet data/3rscan data/matterport3d data/*.pkl +data/*.json exps/ todo.md diff --git a/configs/detection/embodied-det3d_8xb1_embodiedscan-3d-284class-9dof-mlvl.py b/configs/detection/cont-det3d_8xb1_embodiedscan-3d-284class-9dof.py similarity index 99% rename from configs/detection/embodied-det3d_8xb1_embodiedscan-3d-284class-9dof-mlvl.py rename to configs/detection/cont-det3d_8xb1_embodiedscan-3d-284class-9dof.py index 02675dc..6d5a603 100644 --- a/configs/detection/embodied-det3d_8xb1_embodiedscan-3d-284class-9dof-mlvl.py +++ b/configs/detection/cont-det3d_8xb1_embodiedscan-3d-284class-9dof.py @@ -190,7 +190,7 @@ times=8, dataset=dict(type=dataset_type, data_root=data_root, - ann_file='embodiedscan_infos_train_full.pkl', + ann_file='embodiedscan_infos_train_split.pkl', pipeline=train_pipeline, test_mode=False, filter_empty_gt=True, @@ -205,7 +205,7 @@ sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict(type=dataset_type, data_root=data_root, - ann_file='embodiedscan_infos_val_full.pkl', + ann_file='embodiedscan_infos_val_split.pkl', pipeline=test_pipeline, test_mode=True, filter_empty_gt=True, diff --git a/configs/detection/mv-sparse-featfusion_8xb4_embodiedscan-3d-284class-9dof-mlvl-distorted-corners-group4.py b/configs/detection/mv-det3d_8xb4_embodiedscan-3d-284class-9dof.py similarity index 91% rename from configs/detection/mv-sparse-featfusion_8xb4_embodiedscan-3d-284class-9dof-mlvl-distorted-corners-group4.py rename to configs/detection/mv-det3d_8xb4_embodiedscan-3d-284class-9dof.py index a372e2c..976e54b 100644 --- a/configs/detection/mv-sparse-featfusion_8xb4_embodiedscan-3d-284class-9dof-mlvl-distorted-corners-group4.py +++ b/configs/detection/mv-det3d_8xb4_embodiedscan-3d-284class-9dof.py @@ -184,28 +184,30 @@ sampler=dict(type='DefaultSampler', 
shuffle=True), dataset=dict(type='RepeatDataset', times=10, - dataset=dict(type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_train_full.pkl', - pipeline=train_pipeline, - test_mode=False, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - metainfo=metainfo))) + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='embodiedscan_infos_train_split_filtered.pkl', + pipeline=train_pipeline, + test_mode=False, + filter_empty_gt=True, + box_type_3d='Euler-Depth', + metainfo=metainfo))) val_dataloader = dict(batch_size=1, num_workers=1, persistent_workers=True, drop_last=False, sampler=dict(type='DefaultSampler', shuffle=False), - dataset=dict(type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_val_full.pkl', - pipeline=test_pipeline, - test_mode=True, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - metainfo=metainfo)) + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='embodiedscan_infos_val_split_filtered.pkl', + pipeline=test_pipeline, + test_mode=True, + filter_empty_gt=True, + box_type_3d='Euler-Depth', + metainfo=metainfo)) test_dataloader = val_dataloader val_evaluator = dict(type='IndoorDetMetric') diff --git a/configs/grounding/mv-grounding_8xb12_embodiedscan-vg-9dof-full.py b/configs/grounding/mv-grounding_8xb12_embodiedscan-vg-9dof-full.py new file mode 100644 index 0000000..f1568db --- /dev/null +++ b/configs/grounding/mv-grounding_8xb12_embodiedscan-vg-9dof-full.py @@ -0,0 +1,212 @@ +_base_ = ['../default_runtime.py'] +n_points = 100000 + +backend_args = None +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/scannet/': +# 's3://openmmlab/datasets/detection3d/scannet_processed/', +# 'data/scannet/': +# 's3://openmmlab/datasets/detection3d/scannet_processed/' +# })) + +metainfo = dict(classes='all') + +model = dict( + type='SparseFeatureFusion3DGrounder', + num_queries=256, + voxel_size=0.01, + data_preprocessor=dict(type='Det3DDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='mmdet.ResNet', + depth=50, + base_channels=16, # to make it consistent with mink resnet + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + style='pytorch'), + backbone_lidar=dict(type='MinkResNet', in_channels=3, depth=34), + use_xyz_feat=True, + # change due to no img feature fusion + neck_3d=dict(type='MinkNeck', + num_classes=1, + in_channels=[128, 256, 512, 1024], + out_channels=256, + voxel_size=0.01, + pts_prune_threshold=1000), + decoder=dict( + num_layers=6, + return_intermediate=True, + layer_cfg=dict( + # query self attention layer + self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + # cross attention layer query to text + cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + # cross attention layer query to image + cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + ffn_cfg=dict(embed_dims=256, + feedforward_channels=2048, + ffn_drop=0.0)), + post_norm_cfg=None), + bbox_head=dict(type='GroundingHead', + num_classes=256, + sync_cls_avg_factor=True, + decouple_bbox_loss=True, + decouple_groups=4, + share_pred_layer=True, + decouple_weights=[0.2, 0.2, 0.2, 0.4], + contrastive_cfg=dict(max_text_len=256, + log_scale='auto', + bias=True), + loss_cls=dict(type='mmdet.FocalLoss', + use_sigmoid=True, + gamma=2.0, + 
alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='BBoxCDLoss', + mode='l1', + loss_weight=1.0, + group='g8')), + coord_type='DEPTH', + # training and testing settings + train_cfg=dict(assigner=dict(type='HungarianAssigner3D', + match_costs=[ + dict(type='BinaryFocalLossCost', + weight=1.0), + dict(type='BBox3DL1Cost', weight=2.0), + dict(type='IoU3DCost', weight=2.0) + ]), ), + test_cfg=None) + +dataset_type = 'MultiView3DGroundingDataset' +data_root = 'data' + +train_pipeline = [ + dict(type='LoadAnnotations3D'), + dict(type='MultiViewPipeline', + n_images=20, + transforms=[ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadDepthFromFile', backend_args=backend_args), + dict(type='ConvertRGBDToPoints', coord_type='CAMERA'), + dict(type='PointSample', num_points=n_points // 10), + dict(type='Resize', scale=(480, 480), keep_ratio=False) + ]), + dict(type='AggregateMultiViewPoints', coord_type='DEPTH'), + dict(type='PointSample', num_points=n_points), + dict(type='GlobalRotScaleTrans', + rot_range=[-0.087266, 0.087266], + scale_ratio_range=[.9, 1.1], + translation_std=[.1, .1, .1], + shift_height=False), + dict(type='Pack3DDetInputs', + keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict(type='LoadAnnotations3D'), + dict(type='MultiViewPipeline', + n_images=50, + ordered=True, + transforms=[ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadDepthFromFile', backend_args=backend_args), + dict(type='ConvertRGBDToPoints', coord_type='CAMERA'), + dict(type='PointSample', num_points=n_points // 10), + dict(type='Resize', scale=(480, 480), keep_ratio=False) + ]), + dict(type='AggregateMultiViewPoints', coord_type='DEPTH'), + dict(type='PointSample', num_points=n_points), + dict(type='Pack3DDetInputs', + keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d']) +] + +# TODO: to determine a reasonable batch size +train_dataloader = dict( + batch_size=12, + num_workers=12, + 
persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict(type='RepeatDataset', + times=1, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='embodiedscan_infos_train_split_filtered.pkl', + vg_file='embodiedscan_train_full_vg.json', + metainfo=metainfo, + pipeline=train_pipeline, + test_mode=False, + filter_empty_gt=True, + box_type_3d='Euler-Depth'))) + +val_dataloader = dict(batch_size=12, + num_workers=12, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='embodiedscan_infos_val_split_filtered.pkl', + vg_file='embodiedscan_val_full_vg.json', + metainfo=metainfo, + pipeline=test_pipeline, + test_mode=True, + filter_empty_gt=True, + box_type_3d='Euler-Depth')) +test_dataloader = val_dataloader + +val_evaluator = dict(type='GroundingMetric') +test_evaluator = val_evaluator + +# training schedule for 1x +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# optimizer +lr = 5e-4 +optim_wrapper = dict(type='OptimWrapper', + optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005), + paramwise_cfg=dict( + custom_keys={ + 'text_encoder': dict(lr_mult=0.0), + 'decoder': dict(lr_mult=0.1, decay_mult=1.0) + }), + clip_grad=dict(max_norm=10, norm_type=2)) + +# learning rate +param_scheduler = dict(type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) + +custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)] + +# hooks +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3)) + +# vis_backends = [ +# dict(type='TensorboardVisBackend'), +# dict(type='LocalVisBackend') +# ] +# visualizer = dict( +# type='Det3DLocalVisualizer', +# vis_backends=vis_backends, name='visualizer') + +find_unused_parameters = True +load_from = 
'/mnt/petrelfs/wangtai/EmbodiedScan/work_dirs/mv-3ddet-challenge/epoch_12.pth' # noqa diff --git a/configs/grounding/mv-grounding_8xb12_embodiedscan-vg-9dof.py b/configs/grounding/mv-grounding_8xb12_embodiedscan-vg-9dof.py new file mode 100644 index 0000000..38dc457 --- /dev/null +++ b/configs/grounding/mv-grounding_8xb12_embodiedscan-vg-9dof.py @@ -0,0 +1,212 @@ +_base_ = ['../default_runtime.py'] +n_points = 100000 + +backend_args = None +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/scannet/': +# 's3://openmmlab/datasets/detection3d/scannet_processed/', +# 'data/scannet/': +# 's3://openmmlab/datasets/detection3d/scannet_processed/' +# })) + +metainfo = dict(classes='all') + +model = dict( + type='SparseFeatureFusion3DGrounder', + num_queries=256, + voxel_size=0.01, + data_preprocessor=dict(type='Det3DDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='mmdet.ResNet', + depth=50, + base_channels=16, # to make it consistent with mink resnet + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + style='pytorch'), + backbone_lidar=dict(type='MinkResNet', in_channels=3, depth=34), + use_xyz_feat=True, + # change due to no img feature fusion + neck_3d=dict(type='MinkNeck', + num_classes=1, + in_channels=[128, 256, 512, 1024], + out_channels=256, + voxel_size=0.01, + pts_prune_threshold=1000), + decoder=dict( + num_layers=6, + return_intermediate=True, + layer_cfg=dict( + # query self attention layer + self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + # cross attention layer query to text + 
cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + # cross attention layer query to image + cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + ffn_cfg=dict(embed_dims=256, + feedforward_channels=2048, + ffn_drop=0.0)), + post_norm_cfg=None), + bbox_head=dict(type='GroundingHead', + num_classes=256, + sync_cls_avg_factor=True, + decouple_bbox_loss=True, + decouple_groups=4, + share_pred_layer=True, + decouple_weights=[0.2, 0.2, 0.2, 0.4], + contrastive_cfg=dict(max_text_len=256, + log_scale='auto', + bias=True), + loss_cls=dict(type='mmdet.FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='BBoxCDLoss', + mode='l1', + loss_weight=1.0, + group='g8')), + coord_type='DEPTH', + # training and testing settings + train_cfg=dict(assigner=dict(type='HungarianAssigner3D', + match_costs=[ + dict(type='BinaryFocalLossCost', + weight=1.0), + dict(type='BBox3DL1Cost', weight=2.0), + dict(type='IoU3DCost', weight=2.0) + ]), ), + test_cfg=None) + +dataset_type = 'MultiView3DGroundingDataset' +data_root = 'data' + +train_pipeline = [ + dict(type='LoadAnnotations3D'), + dict(type='MultiViewPipeline', + n_images=20, + transforms=[ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadDepthFromFile', backend_args=backend_args), + dict(type='ConvertRGBDToPoints', coord_type='CAMERA'), + dict(type='PointSample', num_points=n_points // 10), + dict(type='Resize', scale=(480, 480), keep_ratio=False) + ]), + dict(type='AggregateMultiViewPoints', coord_type='DEPTH'), + dict(type='PointSample', num_points=n_points), + dict(type='GlobalRotScaleTrans', + rot_range=[-0.087266, 0.087266], + scale_ratio_range=[.9, 1.1], + translation_std=[.1, .1, .1], + shift_height=False), + dict(type='Pack3DDetInputs', + keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict(type='LoadAnnotations3D'), + dict(type='MultiViewPipeline', + n_images=50, + ordered=True, + 
transforms=[ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadDepthFromFile', backend_args=backend_args), + dict(type='ConvertRGBDToPoints', coord_type='CAMERA'), + dict(type='PointSample', num_points=n_points // 10), + dict(type='Resize', scale=(480, 480), keep_ratio=False) + ]), + dict(type='AggregateMultiViewPoints', coord_type='DEPTH'), + dict(type='PointSample', num_points=n_points), + dict(type='Pack3DDetInputs', + keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d']) +] + +# TODO: to determine a reasonable batch size +train_dataloader = dict( + batch_size=12, + num_workers=12, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict(type='RepeatDataset', + times=1, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='embodiedscan_infos_train_split_filtered.pkl', + vg_file='embodiedscan_train_vg.json', + metainfo=metainfo, + pipeline=train_pipeline, + test_mode=False, + filter_empty_gt=True, + box_type_3d='Euler-Depth'))) + +val_dataloader = dict(batch_size=12, + num_workers=12, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='embodiedscan_infos_val_split_filtered.pkl', + vg_file='embodiedscan_val_vg.json', + metainfo=metainfo, + pipeline=test_pipeline, + test_mode=True, + filter_empty_gt=True, + box_type_3d='Euler-Depth')) +test_dataloader = val_dataloader + +val_evaluator = dict(type='GroundingMetric') +test_evaluator = val_evaluator + +# training schedule for 1x +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# optimizer +lr = 5e-4 +optim_wrapper = dict(type='OptimWrapper', + optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005), + paramwise_cfg=dict( + custom_keys={ + 'text_encoder': dict(lr_mult=0.0), + 'decoder': dict(lr_mult=0.1, decay_mult=1.0) 
+ }), + clip_grad=dict(max_norm=10, norm_type=2)) + +# learning rate +param_scheduler = dict(type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) + +custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)] + +# hooks +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3)) + +# vis_backends = [ +# dict(type='TensorboardVisBackend'), +# dict(type='LocalVisBackend') +# ] +# visualizer = dict( +# type='Det3DLocalVisualizer', +# vis_backends=vis_backends, name='visualizer') + +find_unused_parameters = True +load_from = '/mnt/petrelfs/wangtai/EmbodiedScan/work_dirs/mv-3ddet-challenge/epoch_12.pth' # noqa diff --git a/configs/grounding/mv-grounding_8xb12_embodiedscan-vg-9dof_fcaf-coder.py b/configs/grounding/mv-grounding_8xb12_embodiedscan-vg-9dof_fcaf-coder.py new file mode 100644 index 0000000..1342198 --- /dev/null +++ b/configs/grounding/mv-grounding_8xb12_embodiedscan-vg-9dof_fcaf-coder.py @@ -0,0 +1,213 @@ +_base_ = ['../default_runtime.py'] +n_points = 100000 + +backend_args = None +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/scannet/': +# 's3://openmmlab/datasets/detection3d/scannet_processed/', +# 'data/scannet/': +# 's3://openmmlab/datasets/detection3d/scannet_processed/' +# })) + +metainfo = dict(classes='all') + +model = dict( + type='SparseFeatureFusion3DGrounder', + num_queries=256, + voxel_size=0.01, + data_preprocessor=dict(type='Det3DDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='mmdet.ResNet', + depth=50, + base_channels=16, # to make it consistent with mink resnet + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + style='pytorch'), + backbone_lidar=dict(type='MinkResNet', in_channels=3, depth=34), + use_xyz_feat=True, + # change due to no img feature fusion + neck_3d=dict(type='MinkNeck', + num_classes=1, + in_channels=[128, 256, 512, 1024], + out_channels=256, + voxel_size=0.01, + pts_prune_threshold=1000), + decoder=dict( + num_layers=6, + return_intermediate=True, + layer_cfg=dict( + # query self attention layer + self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + # cross attention layer query to text + cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + # cross attention layer query to image + cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + ffn_cfg=dict(embed_dims=256, + feedforward_channels=2048, + ffn_drop=0.0)), + post_norm_cfg=None), + bbox_head=dict(type='GroundingHead', + num_classes=256, + box_coder='FCAF', + sync_cls_avg_factor=True, + decouple_bbox_loss=True, + decouple_groups=4, + share_pred_layer=True, + decouple_weights=[0.2, 0.2, 0.2, 0.4], + contrastive_cfg=dict(max_text_len=256, + log_scale='auto', + bias=True), + loss_cls=dict(type='mmdet.FocalLoss', + use_sigmoid=True, + 
gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='BBoxCDLoss', + mode='l1', + loss_weight=1.0, + group='g8')), + coord_type='DEPTH', + # training and testing settings + train_cfg=dict(assigner=dict(type='HungarianAssigner3D', + match_costs=[ + dict(type='BinaryFocalLossCost', + weight=1.0), + dict(type='BBox3DL1Cost', weight=2.0), + dict(type='IoU3DCost', weight=2.0) + ]), ), + test_cfg=None) + +dataset_type = 'MultiView3DGroundingDataset' +data_root = 'data' + +train_pipeline = [ + dict(type='LoadAnnotations3D'), + dict(type='MultiViewPipeline', + n_images=20, + transforms=[ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadDepthFromFile', backend_args=backend_args), + dict(type='ConvertRGBDToPoints', coord_type='CAMERA'), + dict(type='PointSample', num_points=n_points // 10), + dict(type='Resize', scale=(480, 480), keep_ratio=False) + ]), + dict(type='AggregateMultiViewPoints', coord_type='DEPTH'), + dict(type='PointSample', num_points=n_points), + dict(type='GlobalRotScaleTrans', + rot_range=[-0.087266, 0.087266], + scale_ratio_range=[.9, 1.1], + translation_std=[.1, .1, .1], + shift_height=False), + dict(type='Pack3DDetInputs', + keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict(type='LoadAnnotations3D'), + dict(type='MultiViewPipeline', + n_images=50, + ordered=True, + transforms=[ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadDepthFromFile', backend_args=backend_args), + dict(type='ConvertRGBDToPoints', coord_type='CAMERA'), + dict(type='PointSample', num_points=n_points // 10), + dict(type='Resize', scale=(480, 480), keep_ratio=False) + ]), + dict(type='AggregateMultiViewPoints', coord_type='DEPTH'), + dict(type='PointSample', num_points=n_points), + dict(type='Pack3DDetInputs', + keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d']) +] + +# TODO: to determine a reasonable batch size +train_dataloader = dict( + batch_size=12, + num_workers=12, + 
persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict(type='RepeatDataset', + times=1, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='embodiedscan_infos_train_split_filtered.pkl', + vg_file='embodiedscan_train_vg.json', + metainfo=metainfo, + pipeline=train_pipeline, + test_mode=False, + filter_empty_gt=True, + box_type_3d='Euler-Depth'))) + +val_dataloader = dict(batch_size=12, + num_workers=12, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='embodiedscan_infos_val_split_filtered.pkl', + vg_file='embodiedscan_val_vg.json', + metainfo=metainfo, + pipeline=test_pipeline, + test_mode=True, + filter_empty_gt=True, + box_type_3d='Euler-Depth')) +test_dataloader = val_dataloader + +val_evaluator = dict(type='GroundingMetric') +test_evaluator = val_evaluator + +# training schedule for 1x +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# optimizer +lr = 5e-4 +optim_wrapper = dict(type='OptimWrapper', + optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005), + paramwise_cfg=dict( + custom_keys={ + 'text_encoder': dict(lr_mult=0.0), + 'decoder': dict(lr_mult=0.1, decay_mult=1.0) + }), + clip_grad=dict(max_norm=10, norm_type=2)) + +# learning rate +param_scheduler = dict(type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) + +custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)] + +# hooks +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3)) + +# vis_backends = [ +# dict(type='TensorboardVisBackend'), +# dict(type='LocalVisBackend') +# ] +# visualizer = dict( +# type='Det3DLocalVisualizer', +# vis_backends=vis_backends, name='visualizer') + +find_unused_parameters = True +load_from = 
'/mnt/petrelfs/wangtai/EmbodiedScan/work_dirs/mv-3ddet-challenge/epoch_12.pth' # noqa diff --git a/embodiedscan/datasets/__init__.py b/embodiedscan/datasets/__init__.py index 788c41a..79276ac 100644 --- a/embodiedscan/datasets/__init__.py +++ b/embodiedscan/datasets/__init__.py @@ -1,4 +1,5 @@ -from .embodied_dataset import EmbodiedScanDataset +from .embodiedscan_dataset import EmbodiedScanDataset +from .mv_3dvg_dataset import MultiView3DGroundingDataset from .transforms import * # noqa: F401,F403 -__all__ = ['EmbodiedScanDataset'] +__all__ = ['EmbodiedScanDataset', 'MultiView3DGroundingDataset'] diff --git a/embodiedscan/datasets/embodied_dataset.py b/embodiedscan/datasets/embodiedscan_dataset.py similarity index 100% rename from embodiedscan/datasets/embodied_dataset.py rename to embodiedscan/datasets/embodiedscan_dataset.py diff --git a/embodiedscan/datasets/mv_3dvg_dataset.py b/embodiedscan/datasets/mv_3dvg_dataset.py new file mode 100644 index 0000000..f6a2764 --- /dev/null +++ b/embodiedscan/datasets/mv_3dvg_dataset.py @@ -0,0 +1,509 @@ +# Copyright (c) OpenRobotLab. All rights reserved. +import os +import warnings +from os import path as osp +from typing import Callable, List, Optional, Union + +import mmengine +import numpy as np +from mmengine.dataset import BaseDataset +from mmengine.fileio import load + +from embodiedscan.registry import DATASETS +from embodiedscan.structures import get_box_type + + +@DATASETS.register_module() +class MultiView3DGroundingDataset(BaseDataset): + r"""Multi-View 3D Grounding Dataset for EmbodiedScan. + + This class serves as the API for experiments on the EmbodiedScan Dataset. + + Please refer to `EmbodiedScan Dataset + <https://github.com/OpenRobotLab/EmbodiedScan>`_ for data downloading. + + TODO: Merge the implementation with EmbodiedScanDataset. + + Args: + data_root (str): Path of dataset root. + ann_file (str): Path of annotation file. + vg_file (str): Path of the visual grounding annotation file. 
+ metainfo (dict, optional): Meta information for dataset, such as class + information. Defaults to None. + pipeline (List[dict]): Pipeline used for data processing. + Defaults to []. + box_type_3d (str): Type of 3D box of this dataset. + Based on the `box_type_3d`, the dataset will encapsulate the box + in its original format and then convert it to `box_type_3d`. + Defaults to 'Euler-Depth' in this dataset. + serialize_data (bool): Whether to serialize all data samples to save + memory. Defaults to False. It is set to True typically, but we + need to do the serialization after getting the data_list through + the preliminary loading and converting. Therefore, we set it to + False by default and serialize data samples at the end while + setting this attribute to True. + filter_empty_gt (bool): Whether to filter the data with empty GT. + If it's set to be True, the example with empty annotations after + data pipeline will be dropped and a random example will be chosen + in `__getitem__`. Defaults to True. + remove_dontcare (bool): Whether to remove objects we do not care about. + Defaults to False. + test_mode (bool): Whether the dataset is in test mode. + Defaults to False. + load_eval_anns (bool): Whether to load evaluation annotations. + Defaults to True. Only takes effect when test_mode is True. 
+ """ + # NOTE: category "step" -> "steps" to avoid potential naming conflicts in + # TensorboardVisBackend + METAINFO = { + 'classes': + ('adhesive tape', 'air conditioner', 'alarm', 'album', 'arch', + 'backpack', 'bag', 'balcony', 'ball', 'banister', 'bar', 'barricade', + 'baseboard', 'basin', 'basket', 'bathtub', 'beam', 'beanbag', 'bed', + 'bench', 'bicycle', 'bidet', 'bin', 'blackboard', 'blanket', 'blinds', + 'board', 'body loofah', 'book', 'boots', 'bottle', 'bowl', 'box', + 'bread', 'broom', 'brush', 'bucket', 'cabinet', 'calendar', 'camera', + 'can', 'candle', 'candlestick', 'cap', 'car', 'carpet', 'cart', + 'case', 'ceiling', 'chair', 'chandelier', 'cleanser', 'clock', + 'clothes', 'clothes dryer', 'coat hanger', 'coffee maker', 'coil', + 'column', 'commode', 'computer', 'conducting wire', 'container', + 'control', 'copier', 'cosmetics', 'couch', 'counter', 'countertop', + 'crate', 'crib', 'cube', 'cup', 'curtain', 'cushion', 'decoration', + 'desk', 'detergent', 'device', 'dish rack', 'dishwasher', 'dispenser', + 'divider', 'door', 'door knob', 'doorframe', 'doorway', 'drawer', + 'dress', 'dresser', 'drum', 'duct', 'dumbbell', 'dustpan', 'dvd', + 'eraser', 'excercise equipment', 'fan', 'faucet', 'fence', 'file', + 'fire extinguisher', 'fireplace', 'floor', 'flowerpot', 'flush', + 'folder', 'food', 'footstool', 'frame', 'fruit', 'furniture', + 'garage door', 'garbage', 'glass', 'globe', 'glove', 'grab bar', + 'grass', 'guitar', 'hair dryer', 'hamper', 'handle', 'hanger', 'hat', + 'headboard', 'headphones', 'heater', 'helmets', 'holder', 'hook', + 'humidifier', 'ironware', 'jacket', 'jalousie', 'jar', 'kettle', + 'keyboard', 'kitchen island', 'kitchenware', 'knife', 'label', + 'ladder', 'lamp', 'laptop', 'ledge', 'letter', 'light', 'luggage', + 'machine', 'magazine', 'mailbox', 'map', 'mask', 'mat', 'mattress', + 'menu', 'microwave', 'mirror', 'molding', 'monitor', 'mop', 'mouse', + 'napkins', 'notebook', 'object', 'ottoman', 'oven', 'pack', 'package', + 
'pad', 'pan', 'panel', 'paper', 'paper cutter', 'partition', + 'pedestal', 'pen', 'person', 'piano', 'picture', 'pillar', 'pillow', + 'pipe', 'pitcher', 'plant', 'plate', 'player', 'plug', 'plunger', + 'pool', 'pool table', 'poster', 'pot', 'price tag', 'printer', + 'projector', 'purse', 'rack', 'radiator', 'radio', 'rail', + 'range hood', 'refrigerator', 'remote control', 'ridge', 'rod', + 'roll', 'roof', 'rope', 'sack', 'salt', 'scale', 'scissors', 'screen', + 'seasoning', 'shampoo', 'sheet', 'shelf', 'shirt', 'shoe', 'shovel', + 'shower', 'sign', 'sink', 'soap', 'soap dish', 'soap dispenser', + 'socket', 'speaker', 'sponge', 'spoon', 'stairs', 'stall', 'stand', + 'stapler', 'statue', 'steps', 'stick', 'stool', 'stopcock', 'stove', + 'structure', 'sunglasses', 'support', 'switch', 'table', 'tablet', + 'teapot', 'telephone', 'thermostat', 'tissue', 'tissue box', + 'toaster', 'toilet', 'toilet paper', 'toiletry', 'tool', 'toothbrush', + 'toothpaste', 'towel', 'toy', 'tray', 'treadmill', 'trophy', 'tube', + 'tv', 'umbrella', 'urn', 'utensil', 'vacuum cleaner', 'vanity', + 'vase', 'vent', 'ventilation', 'wall', 'wardrobe', 'washbasin', + 'washing machine', 'water cooler', 'water heater', 'window', + 'window frame', 'windowsill', 'wine', 'wire', 'wood', 'wrap'), + 'valid_class_ids': + (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, + 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, + 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, + 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, + 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 
def __init__(self,
             data_root: str,
             ann_file: str,
             vg_file: str,
             metainfo: Optional[dict] = None,
             pipeline: Optional[List[Union[dict, Callable]]] = None,
             box_type_3d: str = 'Euler-Depth',
             serialize_data: bool = False,
             filter_empty_gt: bool = True,
             remove_dontcare: bool = False,
             test_mode: bool = False,
             load_eval_anns: bool = True,
             **kwargs) -> None:
    """Build the scan-level infos, then expand them to language samples.

    Args:
        data_root (str): Root path of the dataset.
        ann_file (str): Scan-level annotation file.
        vg_file (str): Visual-grounding annotation file, joined with
            ``self.data_root`` after the base class is initialised.
        metainfo (dict, optional): Meta information. ``classes='all'``
            is expanded to ``METAINFO['classes']``. Defaults to None.
        pipeline (List[dict | Callable], optional): Processing pipeline.
            ``None`` is treated as an empty pipeline. Defaults to None.
        box_type_3d (str): Box type handed to ``get_box_type``.
            Defaults to 'Euler-Depth'.
        serialize_data (bool): Forwarded to the base class. The language
            samples are always serialized at the end of this method.
            Defaults to False.
        filter_empty_gt (bool): Stored on the instance. Defaults to True.
        remove_dontcare (bool): Stored on the instance.
            Defaults to False.
        test_mode (bool): Whether the dataset is in test mode.
            Defaults to False.
        load_eval_anns (bool): Whether to load evaluation annotations
            in test mode. Defaults to True.
    """
    if pipeline is None:
        pipeline = []

    # `metainfo` may legitimately be None, so guard the membership test.
    # Work on a shallow copy so the caller's dict (often a shared config
    # object) is not modified.
    if metainfo is not None and 'classes' in metainfo:
        if metainfo['classes'] == 'all':
            metainfo = dict(metainfo)
            metainfo['classes'] = list(self.METAINFO['classes'])

    # Lookup table: raw category id -> contiguous index, -1 if invalid.
    valid_class_ids = self.METAINFO['valid_class_ids']
    self.det3d_valid_id2label = np.full(max(valid_class_ids) + 1,
                                        -1,
                                        dtype=np.int64)
    for cls_idx, cat_id in enumerate(valid_class_ids):
        self.det3d_valid_id2label[cat_id] = cls_idx

    self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d)
    self.filter_empty_gt = filter_empty_gt
    self.remove_dontcare = remove_dontcare
    self.load_eval_anns = load_eval_anns

    super().__init__(data_root=data_root,
                     ann_file=ann_file,
                     metainfo=metainfo,
                     pipeline=pipeline,
                     serialize_data=serialize_data,
                     test_mode=test_mode,
                     **kwargs)

    # Replace the scan-level list by one entry per language annotation
    # and serialize that final list.
    self.vg_file = osp.join(self.data_root, vg_file)
    self.convert_info_to_scan()
    self.data_list = self.load_language_data()
    self.data_bytes, self.data_address = self._serialize_data()
    self.serialize_data = True

def process_metainfo(self):
    """Build the label lookup tables from the combined meta information.

    Called after the metainfo from the annotation file and from the
    config have been merged into ``self._metainfo``. Sets
    ``self.label_mapping`` (category id -> index in ``classes``, -1 when
    the category is not selected) and ``self.occ_label_mapping``
    (category id -> 1-based index in ``occ_classes``, -1 otherwise).
    """
    assert 'categories' in self._metainfo

    # Use the raw dict throughout; no copy of the metainfo is needed to
    # read category ids.
    metainfo = self._metainfo
    categories = metainfo['categories']

    if 'classes' not in metainfo:
        metainfo.setdefault('classes', list(categories.keys()))

    num_ids = max(list(categories.values())) + 1

    # First occurrence wins, matching ``list.index`` semantics.
    class_to_index = {}
    for index, class_name in enumerate(metainfo['classes']):
        class_to_index.setdefault(class_name, index)

    self.label_mapping = np.full(num_ids, -1, dtype=int)
    for key, value in categories.items():
        if key in class_to_index:
            self.label_mapping[value] = class_to_index[key]

    self.occ_label_mapping = np.full(num_ids, -1, dtype=int)
    if 'occ_classes' in metainfo:
        for idx, label_name in enumerate(metainfo['occ_classes']):
            # 1-based, 0 is empty
            self.occ_label_mapping[categories[label_name]] = idx + 1
# need to compensate the scan_id info to the original pkl file
def convert_info_to_scan(self):
    """Index the scan-level entries of ``self.data_list`` by scan id.

    The result is stored in ``self.scans``. If several entries share a
    ``scan_id``, the last one is kept.
    """
    self.scans = {entry['scan_id']: entry for entry in self.data_list}

@staticmethod
def _is_view_dep(text):
    """Return True if ``text`` contains a view-dependent relation word.

    The text is split on whitespace and compared token by token, so the
    match is case-sensitive and punctuation attached to a word prevents
    a match.
    """
    view_dependent_words = (
        'front', 'behind', 'back', 'left', 'right', 'facing', 'leftmost',
        'rightmost', 'looking', 'across'
    )
    tokens = set(text.split())
    return not tokens.isdisjoint(view_dependent_words)
def load_language_data(self):
    """Expand the grounding annotations into per-sample info dicts.

    Each annotation loaded from ``self.vg_file`` is merged with the
    scan-level entry of its ``scan_id`` (taken from ``self.scans``).
    Annotations whose ``target_id`` does not match exactly one box id of
    the scan are skipped. ``self.scans`` is deleted once all annotations
    have been processed.

    Returns:
        list[dict]: One info dict per kept language annotation.
    """
    language_annotations = load(self.vg_file)

    # Scan-level fields copied verbatim into every language sample.
    leading_scan_keys = ('axis_align_matrix', 'img_path',
                         'depth_img_path', 'depth2img')

    language_infos = []
    for anno in mmengine.track_iter_progress(language_annotations):
        language_info = {
            'scan_id': anno['scan_id'],
            'target_id': int(anno['target_id']),
            'distractor_ids': anno['distractor_ids'],
            'text': anno['text'],
            'tokens_positive': anno['tokens_positive'],
        }

        data = self.scans[language_info['scan_id']]
        for key in leading_scan_keys:
            language_info[key] = data[key]
        if 'cam2img' in data:
            language_info['cam2img'] = data['cam2img']
        language_info['scan_id'] = data['scan_id']
        language_info['depth_shift'] = data['depth_shift']
        language_info['depth_cam2img'] = data['depth_cam2img']

        scan_ann_info = data['ann_info']
        object_ids = scan_ann_info['bbox_id']  # numpy array
        labels = scan_ann_info['gt_labels_3d']  # all labels in the scan
        bboxes = scan_ann_info['gt_bboxes_3d']  # all boxes in the scan

        # Locate the target box; it must be present exactly once.
        object_ind = np.where(object_ids == language_info['target_id'])[0]
        if len(object_ind) != 1:
            continue

        distractor_ids = language_info['distractor_ids']
        language_anno_info = dict(
            gt_bboxes_3d=bboxes[object_ind],
            gt_labels_3d=labels[object_ind],
            is_view_dep=self._is_view_dep(language_info['text']),
            # "hard" means more than three distractors
            is_hard=len(distractor_ids) > 3,
            is_unique=len(distractor_ids) == 0)

        if self.test_mode:
            if self.load_eval_anns:
                language_info['ann_info'] = language_anno_info
                language_info['eval_ann_info'] = language_info['ann_info']
        else:
            language_info['ann_info'] = language_anno_info

        language_infos.append(language_info)

    del self.scans

    return language_infos
def parse_ann_info(self, info: dict) -> dict:
    """Process the `instances` in data info to `ann_info`.

    The raw ``bbox_label_3d`` of every instance is first remapped in
    place through ``self.det3d_valid_id2label``. The per-instance fields
    are then stacked into arrays, and label fields are additionally
    mapped through ``self.label_mapping``. When there are no instances,
    empty ``gt_bboxes_3d`` (0 x 9) and ``gt_labels_3d`` arrays are
    produced instead of returning ``None``; this is the fallback that the
    previous early return made unreachable.

    Args:
        info (dict): Info dict. Must contain the key ``instances``.

    Returns:
        dict: Processed `ann_info`, with ``gt_bboxes_3d`` wrapped by
        ``self.box_type_3d``.

    Raises:
        ValueError: If an instance label is not a valid class id.
    """
    instances = info['instances']

    num_valid_ids = self.det3d_valid_id2label.shape[0]
    for instance in instances:
        if instance['bbox_label_3d'] >= num_valid_ids:
            raise ValueError('Class out of range')
        value = self.det3d_valid_id2label[instance['bbox_label_3d']]
        if value < 0:
            raise ValueError('Class out of range')
        instance['bbox_label_3d'] = value

    # add s or gt prefix for most keys after concat
    # we only process 3d annotations here, the corresponding
    # 2d annotation process is in the `LoadAnnotations3D`
    # in `transforms`
    name_mapping = {
        'bbox_label_3d': 'gt_labels_3d',
        'bbox_label': 'gt_bboxes_labels',
        'bbox': 'gt_bboxes',
        'bbox_3d': 'gt_bboxes_3d',
        'depth': 'depths',
        'center_2d': 'centers_2d',
        'attr_label': 'attr_labels',
        'velocity': 'velocities',
    }

    ann_info = None
    if len(instances) > 0:
        ann_info = dict()
        for ann_name in list(instances[0].keys()):
            temp_anns = [item[ann_name] for item in instances]
            # map the original dataset label to training label
            if 'label' in ann_name and ann_name != 'attr_label':
                temp_anns = [self.label_mapping[item] for item in temp_anns]
            mapped_ann_name = name_mapping.get(ann_name, ann_name)

            if 'label' in ann_name:
                temp_anns = np.array(temp_anns).astype(np.int64)
            elif ann_name in name_mapping:
                temp_anns = np.array(temp_anns).astype(np.float32)
            else:
                temp_anns = np.array(temp_anns)

            ann_info[mapped_ann_name] = temp_anns
        ann_info['instances'] = info['instances']

    # empty gt
    if ann_info is None:
        ann_info = dict()
        ann_info['gt_bboxes_3d'] = np.zeros((0, 9), dtype=np.float32)
        ann_info['gt_labels_3d'] = np.zeros((0, ), dtype=np.int64)

    ann_info['gt_bboxes_3d'] = self.box_type_3d(
        ann_info['gt_bboxes_3d'],
        box_dim=ann_info['gt_bboxes_3d'].shape[-1],
        with_yaw=True,
        origin=(0.5, 0.5, 0.5))

    return ann_info
@METRICS.register_module()
class GroundingMetric(BaseMetric):
    """Language grounding evaluation metric.

    For every sample the predicted boxes are ranked by their target score,
    the 10 highest-scoring boxes are kept, and the sample counts as found
    at a threshold if any kept box has an IoU with the ground-truth box
    above that threshold. Results are reported overall and per subset
    (Easy/Hard, View-Dep/View-Indep, Unique/Multi).

    Args:
        iou_thr (float or Sequence[float]): IoU threshold(s) used when
            calculating the metric. Defaults to (0.25, 0.5).
        collect_device (str): Device name used for collecting results from
            different ranks during distributed training. Must be 'cpu' or
            'gpu'. Defaults to 'cpu'.
        prefix (str, optional): The prefix that will be added in the metric
            names to disambiguate homonymous metrics of different evaluators.
            If prefix is not provided in the argument, self.default_prefix will
            be used instead. Defaults to None.
    """

    def __init__(self,
                 iou_thr: Sequence[float] = (0.25, 0.5),
                 collect_device: str = 'cpu',
                 prefix: Optional[str] = None) -> None:
        super(GroundingMetric, self).__init__(prefix=prefix,
                                              collect_device=collect_device)
        # Accept a single number (int or float) as well as a sequence.
        if isinstance(iou_thr, (int, float)):
            self.iou_thr = [iou_thr]
        else:
            self.iou_thr = list(iou_thr)

    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
        """Process one batch of data samples and predictions.

        The processed results should be stored in ``self.results``, which will
        be used to compute the metrics when all batches have been processed.

        Args:
            data_batch (dict): A batch of data from the dataloader.
            data_samples (Sequence[dict]): A batch of outputs from the model.
        """
        for data_sample in data_samples:
            pred_3d = data_sample['pred_instances_3d']
            eval_ann_info = data_sample['eval_ann_info']
            # Move everything that can be moved to the CPU.
            cpu_pred_3d = {
                k: v.to('cpu') if hasattr(v, 'to') else v
                for k, v in pred_3d.items()
            }
            self.results.append((eval_ann_info, cpu_pred_3d))

    def ground_eval(self, gt_annos, det_annos, logger=None):
        """Compute the grounding accuracy for every subset and threshold.

        Args:
            gt_annos (Sequence[dict]): Ground-truth infos. Each provides
                ``gt_bboxes_3d``, ``is_view_dep``, ``is_hard`` and
                ``is_unique``.
            det_annos (Sequence[dict]): Predictions. Each provides
                ``bboxes_3d`` and ``target_scores_3d``.
            logger: Logger passed to ``print_log`` for the result tables.
                Defaults to None.

        Returns:
            dict[str, float]: Accuracy keyed by ``'<subset>@<threshold>'``.
            A subset without samples is reported as 0.
        """
        assert len(det_annos) == len(gt_annos)

        object_types = [
            'Easy', 'Hard', 'View-Dep', 'View-Indep', 'Unique', 'Multi',
            'Overall'
        ]

        # Plain integer counters; the division below guards against empty
        # subsets, so no epsilon is needed here.
        pred = {
            f'{object_type}@{t}': 0
            for t in self.iou_thr for object_type in object_types
        }
        gt = dict.fromkeys(pred, 0)

        for det_anno, gt_anno in zip(det_annos, gt_annos):
            target_scores = det_anno['target_scores_3d']  # (num_query, )

            bboxes = EulerDepthInstance3DBoxes(det_anno['bboxes_3d'].tensor,
                                               origin=(0.5, 0.5, 0.5))
            gt_bboxes = EulerDepthInstance3DBoxes(
                gt_anno['gt_bboxes_3d'].tensor, origin=(0.5, 0.5, 0.5))

            # Every sample falls into one subset of each pair, plus
            # 'Overall'.
            subsets = [
                'View-Dep' if gt_anno['is_view_dep'] else 'View-Indep',
                'Hard' if gt_anno['is_hard'] else 'Easy',
                'Unique' if gt_anno['is_unique'] else 'Multi',
                'Overall',
            ]

            # Keep the 10 highest-scoring predictions.
            box_index = target_scores.argsort(dim=-1, descending=True)[:10]
            top_bbox = bboxes[box_index]

            iou = top_bbox.overlaps(top_bbox, gt_bboxes)

            for t in self.iou_thr:
                found = int((iou > t).any())
                for subset in subsets:
                    key = f'{subset}@{t}'
                    gt[key] += 1
                    pred[key] += found

        header = ['Type']
        header.extend(object_types)
        ret_dict = {}

        for t in self.iou_thr:
            table_columns = [['results']]
            for object_type in object_types:
                metric = object_type + '@' + str(t)
                value = pred[metric] / max(gt[metric], 1)
                ret_dict[metric] = value
                table_columns.append([f'{value:.4f}'])

            table_data = [header]
            table_rows = list(zip(*table_columns))
            table_data += table_rows
            table = AsciiTable(table_data)
            table.inner_footing_row_border = True
            print_log('\n' + table.table, logger=logger)

        return ret_dict

    def compute_metrics(self, results: list) -> Dict[str, float]:
        """Compute the metrics from processed results after all batches have
        been processed.

        Args:
            results (list): The processed results of each batch.

        Returns:
            Dict[str, float]: The computed metrics. The keys are the names of
            the metrics, and the values are corresponding results.
        """
        logger: MMLogger = MMLogger.get_current_instance()
        # An empty result list yields all-zero metrics instead of an
        # unpacking error.
        annotations, preds = zip(*results) if results else ((), ())

        # Hand the logger over so the tables are written through it.
        return self.ground_eval(annotations, preds, logger=logger)
class ContrastiveEmbed(nn.Module):
    """text visual ContrastiveEmbed layer.

    Args:
        max_text_len (int, optional): Maximum length of text. The output is
            padded with ``-inf`` up to this length. Defaults to 256.
        log_scale (Optional[Union[str, float]]): How the similarity matrix
            is normalized. Defaults to None.

            - If set to 'auto', the similarity matrix will be normalized by
              a fixed value ``sqrt(d_c)`` where ``d_c`` is the channel number.
            - If set to 'none' or ``None``, there is no normalization applied.
            - If set to a number, the similarity matrix will be multiplied
              by ``exp(log_scale)``, where ``log_scale`` is learnable.
        bias (bool, optional): Whether to add bias to the output.
            If set to ``True``, a learnable bias that is initialized as -4.6
            will be added to the output. Useful when training from scratch.
            Defaults to False.

    Raises:
        ValueError: If ``log_scale`` is neither a number nor one of
            'auto', 'none', None.
    """

    def __init__(self,
                 max_text_len: int = 256,
                 log_scale: Optional[Union[str, float]] = None,
                 bias: bool = False):
        super().__init__()
        self.max_text_len = max_text_len
        self.log_scale = log_scale
        # Integers such as ``0`` are valid numeric scales too; ``bool`` is
        # excluded because it is an ``int`` subclass.
        is_number = (isinstance(log_scale, (int, float))
                     and not isinstance(log_scale, bool))
        if is_number:
            self.log_scale = nn.Parameter(torch.Tensor([float(log_scale)]),
                                          requires_grad=True)
        elif log_scale not in ['auto', 'none', None]:
            raise ValueError(f'log_scale should be a number or one of '
                             f'"auto", "none", None, but got {log_scale}')

        self.bias = None
        if bias:
            # -log((1 - p) / p) with p = 0.01, i.e. about -4.6
            bias_value = -math.log((1 - 0.01) / 0.01)
            self.bias = nn.Parameter(torch.Tensor([bias_value]),
                                     requires_grad=True)

    def forward(self,
                visual_feat: Tensor,
                text_feat: Tensor,
                text_token_mask: Tensor,
                visual_feat_mask: Tensor = None) -> Tensor:
        """Forward function.

        Args:
            visual_feat (Tensor): Visual features. # (b, num_query, dim)
            text_feat (Tensor): Text features. # (b, text_length, text_dim)
            text_token_mask (Tensor): Boolean mask for the text features;
                positions that are False are filled with ``-inf``.
            visual_feat_mask (Tensor, optional): Boolean mask for the
                visual features; rows that are False are filled with
                ``-inf``. Defaults to None.

        Returns:
            Tensor: Classification score, padded with ``-inf`` along the
            last dimension up to ``max_text_len``.
        """
        res = visual_feat @ text_feat.transpose(
            -1, -2)  # (b, num_query, text_length)
        if isinstance(self.log_scale, nn.Parameter):
            res = res * self.log_scale.exp()
        elif self.log_scale == 'auto':
            # NOTE: similar to the normalizer in self-attention
            res = res / math.sqrt(visual_feat.shape[-1])
        if self.bias is not None:
            res = res + self.bias
        # fill -inf in the padding part
        res.masked_fill_(~text_token_mask[:, None, :], float('-inf'))

        if visual_feat_mask is not None:
            res.masked_fill_(~visual_feat_mask[:, :, None], float('-inf'))

        new_res = torch.full((*res.shape[:-1], self.max_text_len),
                             float('-inf'),
                             device=res.device)
        new_res[..., :res.shape[-1]] = res

        return new_res
self.bg_cls_weight = 0 + self.sync_cls_avg_factor = sync_cls_avg_factor + self.decouple_bbox_loss = decouple_bbox_loss + self.decouple_groups = decouple_groups + self.norm_decouple_loss = norm_decouple_loss + if decouple_weights is None: + self.decouple_weights = [ + 1.0 / self.decouple_groups for _ in range(self.decouple_groups) + ] + else: + self.decouple_weights = decouple_weights + self.num_reg = num_reg + self.box_coder = box_coder + assert self.box_coder in ('baseline', 'FCAF') + class_weight = loss_cls.get('class_weight', None) + if class_weight is not None and (self.__class__ is GroundingHead): + assert isinstance(class_weight, float), 'Expected ' \ + 'class_weight to have type float. Found ' \ + f'{type(class_weight)}.' + # NOTE following the official DETR repo, bg_cls_weight means + # relative classification weight of the no-object class. + bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight) + assert isinstance(bg_cls_weight, float), 'Expected ' \ + 'bg_cls_weight to have type float. Found ' \ + f'{type(bg_cls_weight)}.' + class_weight = torch.ones(num_classes + 1) * class_weight + # set background class as the last indice + class_weight[num_classes] = bg_cls_weight + loss_cls.update({'class_weight': class_weight}) + if 'bg_cls_weight' in loss_cls: + loss_cls.pop('bg_cls_weight') + self.bg_cls_weight = bg_cls_weight + + if train_cfg: + assert 'assigner' in train_cfg, 'assigner should be provided ' \ + 'when train_cfg is set.' 
def _init_layers(self) -> None:
    """Initialize classification branch and regression branch of head.

    One classification and one regression branch is created per
    prediction layer. With ``share_pred_layer`` all layers reuse the same
    module objects; otherwise every layer gets its own deep copy.
    """
    fc_cls = ContrastiveEmbed(**self.contrastive_cfg)

    reg_layers = []
    for _ in range(self.num_reg_fcs):
        reg_layers.extend(
            [Linear(self.embed_dims, self.embed_dims),
             nn.ReLU()])
    reg_layers.append(Linear(self.embed_dims, self.num_reg))
    reg_branch = nn.Sequential(*reg_layers)

    if self.share_pred_layer:
        cls_modules = [fc_cls] * self.num_pred_layer
        reg_modules = [reg_branch] * self.num_pred_layer
    else:
        cls_modules = [
            copy.deepcopy(fc_cls) for _ in range(self.num_pred_layer)
        ]
        reg_modules = [
            copy.deepcopy(reg_branch) for _ in range(self.num_pred_layer)
        ]
    self.cls_branches = nn.ModuleList(cls_modules)
    self.reg_branches = nn.ModuleList(reg_modules)

def init_weights(self) -> None:
    """Initialize the last layer of every regression branch.

    The last layer of each branch is constant-initialized to zero
    (weights and bias). The bias entries from index 2 onwards of the
    first branch are then set to -2.0.
    """
    for branch in self.reg_branches:
        constant_init(branch[-1], 0, bias=0)
    # NOTE(review): the ``[2:]`` slice looks inherited from a 4-dim
    # (cx, cy, w, h) head - confirm it is intended for ``num_reg`` dims.
    first_branch_bias = self.reg_branches[0][-1].bias
    nn.init.constant_(first_branch_bias.data[2:], -2.0)

def get_targets(self, cls_scores_list: List[Tensor],
                pred_bboxes_list: List[Tensor],
                batch_gt_instances_3d: InstanceList,
                batch_img_metas: List[dict]) -> tuple:
    """Compute regression and classification targets for a batch.

    ``self._get_targets_single`` is applied to every sample and the
    per-sample results are gathered into lists.

    Args:
        cls_scores_list (list[Tensor]): Classification scores from a
            single decoder layer, one tensor per sample.
        pred_bboxes_list (list[Tensor]): Predicted boxes from a single
            decoder layer, one tensor per sample.
        batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
            gt_instance_3d.
        batch_img_metas (list[dict]): Meta information of each sample.
            Not used by this method.

    Returns:
        tuple: a tuple containing the following targets.

        - labels_list (list[Tensor]): Labels for all samples.
        - label_weights_list (list[Tensor]): Label weights for all samples.
        - bbox_targets_list (list[Tensor]): BBox targets for all samples.
        - bbox_weights_list (list[Tensor]): BBox weights for all samples.
        - num_total_pos (int): Number of positive samples in the batch.
        - num_total_neg (int): Number of negative samples in the batch.
    """
    per_sample_targets = multi_apply(self._get_targets_single,
                                     cls_scores_list, pred_bboxes_list,
                                     batch_gt_instances_3d)
    (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
     pos_inds_list, neg_inds_list) = per_sample_targets

    num_total_pos = sum(inds.numel() for inds in pos_inds_list)
    num_total_neg = sum(inds.numel() for inds in neg_inds_list)
    return (labels_list, label_weights_list, bbox_targets_list,
            bbox_weights_list, num_total_pos, num_total_neg)
+ + Args: + points (Tensor): Final locations of shape (N, num_query, 3) + bbox_pred (Tensor): Predicted bbox parameters of shape + (N, num_query, 12) or (N, 9) or (N, 12), i.e., + for baseline box_coder: + 9-dim: (3D offsets to the center, log(3D sizes), + alpha, beta, gamma) + 12-dim: (3D offsets to the center, log(3D sizes), + x_raw (3D vector), y_raw (3D vector)); + for FCAF box_coder: + 9-dim: (log(distances to 6 faces) (6D vector), + alpha, beta, gamma), + 12-dim: (log(distances to 6 faces) (6D vector), + x_raw (3D vector), y_raw (3D vector)). + + Returns: + Tensor: Transformed 3D box of shape (N, 6) or (N, 7) or (N, 9). + """ + + assert len(points.size()) == len(bbox_pred.size()) == 3 + batch_size = points.shape[0] + num_queries = points.shape[1] + + if self.box_coder == 'baseline': + if bbox_pred.shape[-1] == 9: + center = bbox_pred[..., :3] + points + size = torch.exp(bbox_pred[..., 3:6]).clamp(min=2e-2) + euler = bbox_pred[..., 6:] + elif bbox_pred.shape[-1] == 12: + center = bbox_pred[..., :3] + points + size = torch.exp(bbox_pred[..., 3:6]).clamp(min=2e-2) + x_raw, y_raw = bbox_pred[..., 6:9], bbox_pred[..., 9:] + rot_mat = ortho_6d_2_Mat(x_raw.view(-1, 3), y_raw.view(-1, 3)) + euler = matrix_to_euler_angles(rot_mat, 'ZXY').view( + batch_size, num_queries, 3) + else: + raise NotImplementedError + return torch.cat((center, size, euler), dim=-1) + elif self.box_coder == 'FCAF': + if bbox_pred.shape[0] == 0: + return bbox_pred + if len(points.size()) == 3: + points = points.reshape(-1, points.size(-1)) + bbox_pred = bbox_pred.reshape(-1, bbox_pred.size(-1)) + # axis-aligned case + if bbox_pred.shape[1] == 6: + x_center = points[..., 0] + (bbox_pred[..., 1] - + bbox_pred[..., 0]) / 2 + y_center = points[..., 1] + (bbox_pred[..., 3] - + bbox_pred[..., 2]) / 2 + z_center = points[..., 2] + (bbox_pred[..., 5] - + bbox_pred[..., 4]) / 2 + # dx_min, dx_max, dy_min, dy_max, dz_min, dz_max + # -> x, y, z, w, l, h + base_bbox = torch.stack([ + x_center, + y_center, 
+ z_center, + bbox_pred[..., 0] + bbox_pred[..., 1], + bbox_pred[..., 2] + bbox_pred[..., 3], + bbox_pred[..., 4] + bbox_pred[..., 5], + ], -1) + return base_bbox + # for rotated boxes (7-DoF or 9-DoF) + # dx_min, dx_max, dy_min, dy_max, dz_min, dz_max, alpha -> + # x_center, y_center, z_center, w, l, h, alpha + # (N, num_queries, 3) + bbox_pred[..., :6] = torch.exp(bbox_pred[..., :6]).clamp(min=2e-2) + shift = torch.stack(((bbox_pred[..., 1] - bbox_pred[..., 0]) / 2, + (bbox_pred[..., 3] - bbox_pred[..., 2]) / 2, + (bbox_pred[..., 5] - bbox_pred[..., 4]) / 2), + dim=-1).view(-1, 1, 3) + if bbox_pred.shape[-1] == 7: + euler = bbox_pred[..., [6]] + shift = rotation_3d_in_axis(shift, bbox_pred[..., 6], + axis=2)[:, 0, :] + elif bbox_pred.shape[-1] == 9: + euler = bbox_pred[..., 6:] + shift = rotation_3d_in_euler(shift, bbox_pred[..., 6:])[:, + 0, :] + elif bbox_pred.shape[-1] == 12: + x_raw, y_raw = bbox_pred[..., 6:9], bbox_pred[..., 9:] + rot_mat = ortho_6d_2_Mat(x_raw.view(-1, 3), y_raw.view(-1, 3)) + euler = matrix_to_euler_angles(rot_mat, 'ZXY') + shift = rotation_3d_in_euler(shift, euler)[:, 0, :] + center = points + shift + size = torch.stack( + (bbox_pred[..., 0] + bbox_pred[..., 1], bbox_pred[..., 2] + + bbox_pred[..., 3], bbox_pred[..., 4] + bbox_pred[..., 5]), + dim=-1) + return torch.cat((center, size, euler), + dim=-1).view(batch_size, num_queries, -1) + else: + raise NotImplementedError + + def _get_targets_single(self, cls_score: Tensor, bbox_pred: Tensor, + gt_instances_3d: InstanceData) -> tuple: + """Compute regression and classification targets for one sample. + + Outputs from a single decoder layer of a single feature level are used. + + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_queries, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_queries, 4]. 
+ gt_instances_3d (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for one image. + + Returns: + tuple[Tensor]: a tuple containing the following for one image. + + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + """ + num_bboxes = bbox_pred.size(0) + + bbox_3d = EulerDepthInstance3DBoxes(bbox_pred) + pred_instances_3d = InstanceData(scores_3d=cls_score, + bboxes_3d=bbox_3d) + # assigner and sampler + assign_result = self.assigner.assign( + pred_instances_3d=pred_instances_3d, + gt_instances_3d=gt_instances_3d) + gt_bboxes = gt_instances_3d.bboxes_3d.tensor + + pos_inds = torch.nonzero(assign_result.gt_inds > 0, + as_tuple=False).squeeze(-1).unique() + neg_inds = torch.nonzero(assign_result.gt_inds == 0, + as_tuple=False).squeeze(-1).unique() + pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 + pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds.long(), :] + + # Major changes. The labels are 0-1 binary labels for each bbox + # and text tokens. 
+ labels = gt_bboxes.new_full((num_bboxes, self.max_text_len), + 0, + dtype=torch.float32) + # (num_bboxes , max_text_len) gt_labels map + labels[pos_inds] = gt_instances_3d.positive_maps[pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred, dtype=gt_bboxes.dtype) + bbox_weights = torch.zeros_like(bbox_pred, dtype=gt_bboxes.dtype) + bbox_weights[pos_inds] = 1.0 + bbox_targets[pos_inds] = pos_gt_bboxes + return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, + neg_inds) + + def forward( + self, + hidden_states: Tensor, + text_feats: Tensor, + text_token_mask: Tensor, + ) -> Tuple[Tensor]: + """Forward function. + + Args: + hidden_states (Tensor): Hidden states output from each decoder + layer, has shape (num_decoder_layers, bs, num_queries, dim). + pred_bboxes (List[Tensor]): List of the reference from the decoder. + each with shape (bs, num_queries, 9) + text_feats (Tensor): Text feats. It has shape (bs, len_text, + text_embed_dims). + text_token_mask (Tensor): Text token mask. It has shape (bs, + len_text). + + Returns: + tuple[Tensor]: results of head containing the following tensor. + + - all_layers_outputs_classes (Tensor): Outputs from the + classification head, has shape (num_decoder_layers, bs, + num_queries, cls_out_channels). + - all_layers_outputs_coords (Tensor): Sigmoid outputs from the + regression head with normalized coordinate format (cx, cy, w, + h), has shape (num_decoder_layers, bs, num_queries, 4) with the + last dimension arranged as (cx, cy, w, h). + """ + all_layers_cls_scores = [] + + for layer_id in range(hidden_states.shape[0]): + # NOTE The last reference will not be used. 
+ hidden_state = hidden_states[layer_id] + cls_scores = self.cls_branches[layer_id](hidden_state, text_feats, + text_token_mask) + all_layers_cls_scores.append(cls_scores) + + all_layers_cls_scores = torch.stack(all_layers_cls_scores) + + return (all_layers_cls_scores, ) + + def predict(self, hidden_states: Tensor, all_layers_pred_bboxes: Tensor, + text_feats: Tensor, text_token_mask: Tensor, + batch_data_samples: SampleList) -> InstanceList: + """Perform forward propagation and loss calculation of the detection + head on the queries of the upstream network. + + Args: + hidden_states (Tensor): Hidden states output from each decoder + layer, has shape (num_decoder_layers, num_queries, bs, dim). + references (List[Tensor]): List of the reference from the decoder. + The first reference is the `init_reference` (initial) and the + other num_decoder_layers(6) references are `inter_references` + (intermediate). The `init_reference` has shape (bs, + num_queries, 4) when `as_two_stage` of the detector is `True`, + otherwise (bs, num_queries, 2). Each `inter_reference` has + shape (bs, num_queries, 4) when `with_box_refine` of the + detector is `True`, otherwise (bs, num_queries, 2). The + coordinates are arranged as (cx, cy) when the last dimension is + 2, and (cx, cy, w, h) when it is 4. + text_feats (Tensor): Memory text. It has shape (bs, len_text, + text_embed_dims). + text_token_mask (Tensor): Text token mask. It has shape (bs, + len_text). + batch_data_samples (SampleList): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool, optional): If `True`, return boxes in original + image space. Defaults to `True`. + + Returns: + InstanceList: Detection results of each image + after the post process. 
+ """ + batch_input_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + batch_gt_bboxes_3d = [ + data_samples.gt_instances_3d.bboxes_3d + for data_samples in batch_data_samples + ] + batch_positive_maps = [ + data_samples.gt_instances_3d.positive_maps + for data_samples in batch_data_samples + ] + batch_token_positive_maps = None + + outs = self(hidden_states, text_feats, text_token_mask) + + predictions = self.predict_by_feat( + *outs, + all_layers_pred_bboxes, + batch_input_metas=batch_input_metas, + batch_gt_bboxes_3d=batch_gt_bboxes_3d, + batch_positive_maps=batch_positive_maps, + batch_token_positive_maps=batch_token_positive_maps) + return predictions + + def predict_by_feat(self, + all_layers_cls_scores: Tensor, + all_layers_pred_bboxes: Tensor, + batch_input_metas: List[Dict], + batch_gt_bboxes_3d: List, + batch_positive_maps: List, + batch_token_positive_maps=None) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. + + Args: + all_layers_cls_scores (Tensor): Classification scores of all + decoder layers, has shape (num_decoder_layers, bs, num_queries, + max_text_lenth). + all_layers_bbox_preds (Tensor): Regression outputs of all decoder + layers. Each is a 12-tensor with shape (num_decoder_layers, bs, + num_queries, reg_num). + batch_input_metas (List[Dict]): _description_ + batch_token_positive_maps (list[dict], Optional): Batch token + positive map. Defaults to None. Actually batch_data_samples + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
+ """ + cls_scores = all_layers_cls_scores[-1] + bbox_preds = all_layers_pred_bboxes[-1] + result_list = [] + for img_id in range(len(batch_input_metas)): + cls_score = cls_scores[img_id] + bbox_pred = bbox_preds[img_id] + gt_bboxes_3d = batch_gt_bboxes_3d[img_id] + positive_maps = batch_positive_maps[img_id] + results = self._predict_by_feat_single(cls_score, bbox_pred, + gt_bboxes_3d, positive_maps) + result_list.append(results) + return result_list + + def _predict_by_feat_single(self, cls_score: Tensor, bbox_pred: Tensor, + gt_bboxes_3d: Tensor, + positive_maps: Tensor) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + cls_score (Tensor): Box score logits from the last decoder layer + for each image. Shape [num_queries, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from the last decoder layer + for each image, with coordinate format (cx, cy, w, h) and + shape [num_queries, 4]. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
+ """ + assert len(cls_score) == len(bbox_pred) # num_queries + + cls_score = cls_score.sigmoid() # (num_query, self.max_text_len 256) + target_token_maps = positive_maps.squeeze(0) > 0 + # (num_query, num_target_tokens) + target_cls_score = cls_score[:, target_token_maps] + scores, _ = cls_score.max(-1) + target_scores = target_cls_score.sum(-1) + + results = InstanceData() + results.bboxes_3d = EulerDepthInstance3DBoxes(bbox_pred) + results.scores_3d = scores + results.target_scores_3d = target_scores + + return results + + def loss(self, hidden_states: Tensor, all_layers_pred_bboxes: Tensor, + text_feats: Tensor, text_token_mask: Tensor, + batch_data_samples: SampleList) -> dict: + """Perform forward propagation and loss calculation of the detection + head on the queries of the upstream network. + + Args: + hidden_states (Tensor): Hidden states output from each decoder + layer, has shape (num_decoder_layers, bs, num_queries_total, + dim), where `num_queries_total` is the sum of + `num_denoising_queries` and `num_matching_queries` when + `self.training` is `True`, else `num_matching_queries`. + text_feats (Tensor): Memory text. It has shape (bs, len_text, + text_embed_dims). + batch_data_samples (list[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components. 
+ """ + batch_gt_instances_3d = [] + batch_input_metas = [] + for data_sample in batch_data_samples: + batch_input_metas.append(data_sample.metainfo) + batch_gt_instances_3d.append(data_sample.gt_instances_3d) + + outs = self(hidden_states, text_feats, text_token_mask) + self.text_masks = text_token_mask + loss_inputs = outs + (all_layers_pred_bboxes, batch_gt_instances_3d, + batch_input_metas) + losses = self.loss_by_feat(*loss_inputs) + return losses + + def loss_by_feat( + self, + all_layers_cls_scores: Tensor, + all_layers_pred_bboxes: Tensor, + batch_gt_instances_3d: InstanceList, + batch_input_metas: List[dict], + ) -> Dict[str, Tensor]: + """Loss function. + + Args: + all_layers_cls_scores (Tensor): Classification scores of all + decoder layers, has shape (num_decoder_layers, bs, + num_queries_total, cls_out_channels), where + `num_queries_total` is the sum of `num_denoising_queries` + and `num_matching_queries`. + all_layers_bbox_preds (Tensor): Regression outputs of all decoder + layers. Each is a 4D-tensor with normalized coordinate format + (cx, cy, w, h) and has shape (num_decoder_layers, bs, + num_queries_total, 4). + batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_input_metas (list[dict]): Meta information of each image, + e.g., image size, scaling factor, etc. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + losses_cls, losses_bbox = multi_apply( + self.loss_by_feat_single, + all_layers_cls_scores, + all_layers_pred_bboxes, + batch_gt_instances_3d=batch_gt_instances_3d, + batch_input_metas=batch_input_metas) + loss_dict = dict() + # loss from the last decoder layer + loss_dict['loss_cls'] = losses_cls[-1] + loss_dict['loss_bbox'] = losses_bbox[-1] + num_dec_layer = 0 + for loss_cls_i, loss_bbox_i in \ + zip(losses_cls[:-1], losses_bbox[:-1]): + loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i + num_dec_layer += 1 + return loss_dict + + def loss_by_feat_single(self, cls_scores: Tensor, pred_bboxes: Tensor, + batch_gt_instances_3d: InstanceList, + batch_input_metas: List[dict]) -> Tuple[Tensor]: + """Loss function for outputs from a single decoder layer of a single + feature level. + + Args: + cls_scores (Tensor): Box score logits from a single decoder layer + for all images, has shape (bs, num_queries, cls_out_channels). + bbox_preds (Tensor): Sigmoid outputs from a single decoder layer + for all sample. # (bs, num_queries, 12) + batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_input_metas (list[dict]): Meta information of each image, + e.g., image size, scaling factor, etc. + + Returns: + Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and + `loss_angle`. 
+ """ + batch_size = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(batch_size)] + pred_bboxes_list = [pred_bboxes[i] for i in range(batch_size)] + with torch.no_grad(): + cls_reg_targets = self.get_targets(cls_scores_list, + pred_bboxes_list, + batch_gt_instances_3d, + batch_input_metas) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + labels = torch.stack(labels_list, 0) # (bs, 1, max_text_len 256) + label_weights = torch.stack(label_weights_list, 0) # (bs*num_query, 1) + bbox_targets = torch.cat(bbox_targets_list, 0) # (bs*num_query, 9) + bbox_weights = torch.cat(bbox_weights_list, 0) # (bs*num_query, 1) + + # ===== this change ===== + # Loss is not computed for the padded regions of the text. + assert self.text_masks.dim() == 2 + text_masks = self.text_masks.new_zeros( + (self.text_masks.size(0), self.max_text_len)) + text_masks[:, :self.text_masks.size(1)] = self.text_masks + text_mask = (text_masks > 0).unsqueeze( + 1) # turn to bool and then (bs, 1, max_text_len) + text_mask = text_mask.repeat(1, cls_scores.size(1), + 1) # (bs, num_query, max_text_len) + # cls_scores (bs, num_query, self.max_text_len 256) + cls_scores = torch.masked_select( + cls_scores, text_mask).contiguous() # one-dimension + + labels = torch.masked_select(labels, text_mask) # one-dimension + label_weights = label_weights[..., + None].repeat(1, 1, text_mask.size(-1)) + label_weights = torch.masked_select(label_weights, text_mask) + + # classification loss + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + cls_avg_factor = max(cls_avg_factor, 1) + + loss_cls = self.loss_cls(cls_scores, + labels, + label_weights, + avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes 
across all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + pred_bboxes = pred_bboxes.reshape( + -1, pred_bboxes.size(-1)) # (bs*num_query, 12) + + valid_box_mask = bbox_weights[:, 0] > 0 + valid_bbox_preds = pred_bboxes[valid_box_mask] # (bs, 9) + valid_bbox_targets = bbox_targets[valid_box_mask] # (bs, 9) + + if self.decouple_bbox_loss: + bbox_targ_center = valid_bbox_targets[:, :3] + bbox_targ_size = valid_bbox_targets[:, 3:6] + bbox_targ_euler = valid_bbox_targets[:, 6:] + bbox_pred_center = valid_bbox_preds[:, :3] + bbox_pred_size = valid_bbox_preds[:, 3:6] + bbox_pred_euler = valid_bbox_preds[:, 6:] + + corner_bbox_loss = 0 + if self.decouple_bbox_loss: + assert self.decouple_groups in ( + 3, 4), 'Only support groups=3 or 4 with stable performance.' + if self.norm_decouple_loss: + corner_bbox_loss += self.decouple_weights[0] * self.loss_bbox( + torch.concat( + (bbox_pred_center, bbox_targ_size, bbox_targ_euler), + dim=-1), + valid_bbox_targets, + reduction_override='none') + corner_bbox_loss += self.decouple_weights[1] * self.loss_bbox( + torch.concat( + (bbox_targ_center, bbox_pred_size, bbox_targ_euler), + dim=-1), + valid_bbox_targets, + reduction_override='none') + corner_bbox_loss += self.decouple_weights[2] * self.loss_bbox( + torch.concat( + (bbox_targ_center, bbox_targ_size, bbox_pred_euler), + dim=-1), + valid_bbox_targets, + reduction_override='none') + bbox_sizes = bbox_targ_size.norm(dim=-1)[:, + None].clamp(min=0.1) + corner_bbox_loss = (corner_bbox_loss / bbox_sizes).mean() + else: + corner_bbox_loss += self.decouple_weights[0] * self.loss_bbox( + torch.concat( + (bbox_pred_center, bbox_targ_size, bbox_targ_euler), + dim=-1), valid_bbox_targets) + corner_bbox_loss += self.decouple_weights[1] * self.loss_bbox( + torch.concat( + (bbox_targ_center, bbox_pred_size, bbox_targ_euler), + dim=-1), valid_bbox_targets) + corner_bbox_loss += 
def normalize_vector(vector):
    """Scale each row of a (batch, D) tensor to (almost) unit L2 norm.

    A small epsilon (1e-8) is added to the norm so that all-zero rows do
    not cause a division by zero.
    """
    return vector / (vector.norm(dim=1, keepdim=True) + 1e-8)


def cross_product(a, b):
    """Return the row-wise cross product of two (batch, 3) tensors."""
    return torch.cross(a, b, dim=1)


def ortho_6d_2_Mat(x_raw, y_raw):
    """Build rotation matrices from two raw 3D vectors per sample.

    Args:
        x_raw (Tensor): Raw x-axis hints of shape (batch, 3).
        y_raw (Tensor): Raw y-axis hints of shape (batch, 3).

    Returns:
        Tensor: Matrices of shape (batch, 3, 3) whose columns are the
        x, y and z axes, in that order.
    """
    y_axis = normalize_vector(y_raw)
    # z is perpendicular to both the raw x hint and the normalized y axis.
    z_axis = normalize_vector(cross_product(x_raw, y_axis))
    # Recompute x from y and z so the three axes are mutually orthogonal.
    x_axis = cross_product(y_axis, z_axis)
    return torch.stack((x_axis, y_axis, z_axis), dim=2)
def create_positive_map(tokenized,
                        tokens_positive: list,
                        max_num_entities: int = 256) -> Tensor:
    """Build a normalized box-to-token association map.

    Entry ``[i, j]`` is non-zero when box ``i`` is associated with token
    ``j``. Each row is divided by its sum (plus 1e-6), so a row with
    matches sums to just under 1 and a row without matches stays zero.

    Args:
        tokenized: The tokenized input; must provide
            ``char_to_token(char_index)`` returning a token index or None.
        tokens_positive (list): For every box, a list of ``(beg, end)``
            character spans (``end`` exclusive) that refer to it.
        max_num_entities (int, optional): Number of columns of the map.
            Defaults to 256.

    Returns:
        torch.Tensor: Float tensor of shape
        (len(tokens_positive), max_num_entities).

    Raises:
        Exception: Re-raised (after printing the offending span) if the
            first character-to-token lookup of a span fails.
    """

    def _first_token(char_indices):
        # Try the candidate character positions in order and return the
        # first token index found; any lookup error yields None.
        try:
            for char_idx in char_indices:
                token_idx = tokenized.char_to_token(char_idx)
                if token_idx is not None:
                    return token_idx
        except Exception:
            return None
        return None

    # max number of tokens
    positive_map = torch.zeros((len(tokens_positive), max_num_entities),
                               dtype=torch.float)

    for box_id, spans in enumerate(tokens_positive):
        for (beg, end) in spans:
            try:
                beg_pos = tokenized.char_to_token(beg)
                end_pos = tokenized.char_to_token(end - 1)
            except Exception:
                print('beg:', beg, 'end:', end)
                print('token_positive:', tokens_positive)
                raise
            # A span boundary may not map to a token; nudge it inwards by
            # up to two characters.
            if beg_pos is None:
                beg_pos = _first_token((beg + 1, beg + 2))
            if end_pos is None:
                end_pos = _first_token((end - 2, end - 3))
            if beg_pos is None or end_pos is None:
                continue

            positive_map[box_id, beg_pos:end_pos + 1] = 1
    # normalize each row so that its sum is <= 1
    return positive_map / (positive_map.sum(-1, keepdim=True) + 1e-6)
    def __init__(self,
                 backbone: ConfigType,
                 backbone_lidar: ConfigType,
                 bbox_head: ConfigType,
                 neck: OptConfigType = None,
                 neck_3d: OptConfigType = None,
                 neck_lidar: OptConfigType = None,
                 decoder: OptConfigType = None,
                 voxel_size: float = 0.01,
                 num_queries: int = 512,
                 coord_type: str = 'CAMERA',
                 train_cfg: OptConfigType = None,
                 test_cfg: OptConfigType = None,
                 data_preprocessor: OptConfigType = None,
                 use_xyz_feat: bool = False,
                 init_cfg: OptConfigType = None) -> None:
        """Build all sub-modules of the grounder from their configs.

        Args:
            backbone (dict): Config of the image backbone, built through
                the ``MODELS`` registry.
            backbone_lidar (dict): Config of the point-cloud backbone.
            bbox_head (dict): Config of the box head. It is updated in
                place with ``train_cfg`` and ``test_cfg`` before building.
            neck (dict, optional): Config of ``self.neck``; the attribute
                is only created when this is not None.
            neck_3d (dict, optional): Config of ``self.neck_3d``; only
                created when not None.
            neck_lidar (dict, optional): Config of ``self.neck_lidar``;
                only created when not None.
            decoder (dict, optional): Keyword arguments for
                ``SparseFeatureFusionTransformerDecoder``; unpacked in
                ``_init_layers``, so None is not usable in practice.
            voxel_size (float): Stored as ``self.voxel_size``.
                Defaults to 0.01.
            num_queries (int): Stored as ``self.num_queries``.
                Defaults to 512.
            coord_type (str): Stored as ``self.coord_type``.
                Defaults to 'CAMERA'.
            train_cfg (dict, optional): Training config, also injected
                into ``bbox_head``.
            test_cfg (dict, optional): Test config, also injected into
                ``bbox_head``.
            data_preprocessor (dict, optional): Passed to the base class.
            use_xyz_feat (bool): Stored as ``self.use_xyz_feat``.
                Defaults to False.
            init_cfg (dict, optional): Passed to the base class.

        Raises:
            ImportError: If MinkowskiEngine could not be imported.
        """
        super().__init__(data_preprocessor=data_preprocessor,
                         init_cfg=init_cfg)
        self.backbone = MODELS.build(backbone)
        self.backbone_lidar = MODELS.build(backbone_lidar)
        # Optional necks: the attributes stay undefined when no config is
        # given, which is what the ``with_neck*`` properties check for.
        if neck is not None:
            self.neck = MODELS.build(neck)
        if neck_3d is not None:
            self.neck_3d = MODELS.build(neck_3d)
        if neck_lidar is not None:
            self.neck_lidar = MODELS.build(neck_lidar)
        # NOTE: mutates the caller's ``bbox_head`` config.
        bbox_head.update(train_cfg=train_cfg)
        bbox_head.update(test_cfg=test_cfg)
        self.bbox_head = MODELS.build(bbox_head)
        # Still a config here; replaced by the built decoder module in
        # ``_init_layers``.
        self.decoder = decoder
        self.coord_type = coord_type
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        self.num_queries = num_queries
        if ME is None:
            raise ImportError(
                'Please follow `getting_started.md` to install MinkowskiEngine.`'  # noqa: E501
            )
        self.voxel_size = voxel_size
        self.use_xyz_feat = use_xyz_feat
        self._init_layers()
'neck') and self.neck is not None + + @property + def with_neck_3d(self): + """Whether the detector has a 3D neck.""" + return hasattr(self, 'neck_3d') and self.neck_3d is not None + + @property + def with_neck_lidar(self): + """Whether the detector has a 2D backbone.""" + return hasattr(self, 'neck_lidar') and self.neck_lidar is not None + + def convert_sparse_feature(self, x: List[Tensor], batch_size: int): + """Convert SparseTensor to pytorch tensor. + + Args: + batch_inputs_dict (dict): The model input dict which includes + 'points' keys. + + - points (list[torch.Tensor]): Point cloud of each sample. + + Returns: + tuple[Tensor] | dict: For outside 3D object detection, we + typically obtain a tuple of features from the backbone + neck, + and for inside 3D object detection, usually a dict containing + features will be obtained. + """ + + batch_features_list = [[] + for _ in range(batch_size)] # list of features + batch_coords_list = [[] + for _ in range(batch_size)] # list of coordinates + + # for each level of sparsetensor feature + for sparse_tensor in x: + # extract non-zero features + features = sparse_tensor.F + # Obtain the coordinates of batch decomposition + # remember x self.voxel_size + decomposed_coords = [ + coords * self.voxel_size + for coords in sparse_tensor.decomposed_coordinates + ] + + for batch_idx, coords in enumerate(decomposed_coords): + # Since decomposed_coordinates are already separated + # by batches, we can use them directly. 
def extract_feat(
    self, batch_inputs_dict: Dict[str, Tensor],
    batch_data_samples: SampleList
) -> Union[Tuple[torch.Tensor], Dict[str, Tensor]]:
    """Extract sparse 3D features fused with sampled image features.

    The point clouds are voxelised into a single MinkowskiEngine sparse
    tensor and passed through ``self.backbone_lidar``; the images are passed
    through ``self.backbone``. For every sample and every sparse feature
    level, the voxel positions (voxel coordinates * ``self.voxel_size``) are
    projected into the image(s), the image features are sampled there and
    concatenated to the sparse features. The fused levels then go through
    ``self.neck_lidar`` (if present) and ``self.neck_3d``.

    Args:
        batch_inputs_dict (dict): The model input dict. Keys read here:

            - points (list[torch.Tensor]): Point cloud of each sample. The
              first three columns are used as xyz.
            - imgs (torch.Tensor): Images. A tensor with more than four
              dimensions is treated as (B, n_views, C, H, W) and flattened
              over the first two dimensions before the image backbone.
        batch_data_samples (SampleList): Data samples; only ``metainfo`` of
            each sample is read (projection matrices, ``img_shape`` and the
            optional ``scale_factor``, ``flip``, ``img_crop_offset``).

    Returns:
        tuple: ``(feats, scores, coords)`` exactly as returned by
        ``self.neck_3d(x, batch_size)``.
    """
    points = batch_inputs_dict['points']
    # construct sparse tensor and features
    if self.use_xyz_feat:
        coordinates, features = ME.utils.batch_sparse_collate(
            [(p[:, :3] / self.voxel_size, p) for p in points],
            device=points[0].device)
    else:
        coordinates, features = ME.utils.batch_sparse_collate(
            [(p[:, :3] / self.voxel_size, p[:, 3:]) for p in points],
            device=points[0].device)

    x = ME.SparseTensor(coordinates=coordinates, features=features)

    x = self.backbone_lidar(x)
    num_levels = len(x)
    num_samples = len(x[0].decomposed_coordinates)

    # extract img features
    img = batch_inputs_dict['imgs']
    batch_img_metas = [
        data_samples.metainfo for data_samples in batch_data_samples
    ]
    batch_size = img.shape[0]

    if len(img.shape) > 4:  # (B, n_views, C, H, W)
        img = img.reshape([-1] + list(img.shape)[2:])
        img_features = self.backbone(img)
        # restore the per-sample view dimension on every feature level
        img_features = [
            img_feat.reshape([batch_size, -1] + list(img_feat.shape)[1:])
            for img_feat in img_features
        ]
    else:
        img_features = self.backbone(img)

    all_points_imgfeats = []

    for idx, img_meta in enumerate(batch_img_metas):
        img_scale_factor = (img.new_tensor(img_meta['scale_factor'][:2])
                            if 'scale_factor' in img_meta else 1)
        img_flip = img_meta.get('flip', False)
        img_crop_offset = (img.new_tensor(img_meta['img_crop_offset'])
                           if 'img_crop_offset' in img_meta else 0)
        proj_mat = get_proj_mat_by_coord_type(img_meta, self.coord_type)
        points_imgfeats = []
        # Multi-View Sparse Fusion
        if isinstance(proj_mat, dict):
            assert 'extrinsic' in proj_mat
            assert 'intrinsic' in proj_mat
            # Support different intrinsic matrices for different images:
            # if the original intrinsic is only a matrix, it is expected
            # to have been copied into a list in MultiViewPipeline.
            assert isinstance(proj_mat['intrinsic'], list)
            projection = []
            for proj_idx in range(len(proj_mat['extrinsic'])):
                intrinsic = img.new_tensor(proj_mat['intrinsic'][proj_idx])
                extrinsic = img.new_tensor(proj_mat['extrinsic'][proj_idx])
                projection.append(intrinsic @ extrinsic)
            proj_mat = torch.stack(projection)
            for level_idx in range(num_levels):
                # voxel coordinates -> absolute positions
                point = x[level_idx].decomposed_coordinates[
                    idx] * self.voxel_size
                points_imgfeat = batch_point_sample(
                    img_meta,
                    img_features=img_features[level_idx][idx],
                    points=point,
                    proj_mat=proj_mat,
                    coord_type=self.coord_type,
                    img_scale_factor=img_scale_factor,
                    img_crop_offset=img_crop_offset,
                    img_flip=img_flip,
                    img_pad_shape=img.shape[-2:],
                    img_shape=img_meta['img_shape'][:2],
                    aligned=False)
                points_imgfeats.append(
                    points_imgfeat)  # one sample, all levels
        else:
            # NOTE(review): this indexes the backbone output by sample index
            # while the multi-view branch indexes it by level first; confirm
            # the expected layout of `img_features` for single-view input.
            feature = img_features[idx]
            # `points` is a list of tensors and has no `new_tensor`; build
            # the matrix from the image tensor instead.
            proj_mat = img.new_tensor(proj_mat)
            for level_idx in range(num_levels):
                # voxel coordinates -> absolute positions
                point = x[level_idx].decomposed_coordinates[
                    idx] * self.voxel_size
                points_imgfeat = point_sample(
                    img_meta,
                    img_features=feature[None, ...],
                    points=point,
                    proj_mat=point.new_tensor(proj_mat),
                    coord_type='CAMERA',
                    img_scale_factor=img_scale_factor,
                    img_crop_offset=img_crop_offset,
                    img_flip=img_flip,
                    img_pad_shape=img.shape[-2:],
                    img_shape=img_meta['img_shape'][:2],
                    aligned=False)
                points_imgfeats.append(
                    points_imgfeat)  # one sample, all levels
        all_points_imgfeats.append(
            points_imgfeats)  # all samples, all levels

    # append img features
    for level_idx in range(num_levels):
        mlvl_feats = torch.cat([
            all_points_imgfeats[sample_idx][level_idx]
            for sample_idx in range(num_samples)
        ])
        # reuse the coordinate map of the sparse level so that both tensors
        # can be concatenated feature-wise
        img_x = ME.SparseTensor(
            features=mlvl_feats,
            coordinate_map_key=x[level_idx].coordinate_map_key,
            coordinate_manager=x[level_idx].coordinate_manager)
        x[level_idx] = ME.cat(x[level_idx], img_x)

    if self.with_neck_lidar:
        x = self.neck_lidar(x)

    # channel mapper feature of different level to the fixed number
    feats, scores, coords = self.neck_3d(x, batch_size)

    return feats, scores, coords
max_feats_length - feats.size(0) for feats in feats_with_pos_list + ] + + padded_feats_list = [] + feats_mask_list = [] + for batch_id, feats in enumerate(feats_with_pos_list): + # If padding is needed, create a padding tensor + # of the corresponding size. + if padding_length[batch_id] > 0: + padding_feats = torch.zeros(padding_length[batch_id], + feats.size(1)).to(feats.device) + padded_feats = torch.cat([feats, padding_feats], dim=0) + else: + padded_feats = feats + padded_feats_list.append(padded_feats) + feats_mask = torch.zeros(max_feats_length, + dtype=torch.bool).to(feats.device) + feats_mask[:feats.size(0)] = 1 + feats_mask_list.append(feats_mask) + + feats_with_pos = torch.stack( + padded_feats_list) # (b, max_feats_length, C+3) + feats_mask = torch.stack( + feats_mask_list).bool() # (b, max_feats_length) + + feats, coords = feats_with_pos[..., :-3], feats_with_pos[..., -3:] + + # (b, max_feats_length, max_text_length) + enc_outputs_class = self.bbox_head.cls_branches[ + self.decoder.num_layers](feats, text_feats, text_token_mask, + feats_mask) + + # calculate the min visual token sizes in the batch + topk = min(self.num_queries, min_feats_length) + topk_indices = torch.topk(enc_outputs_class.max(-1)[0], k=topk, + dim=1)[1] + + bbox_preds = self.bbox_head.reg_branches[self.decoder.num_layers]( + feats) + bbox_pred_bboxes = self.bbox_head._bbox_pred_to_bbox( + coords, bbox_preds) + + topk_query_coords = torch.gather( + coords, 1, + topk_indices.unsqueeze(-1).repeat(1, 1, 3)) + topk_pred_bboxes = torch.gather( + bbox_pred_bboxes, 1, + topk_indices.unsqueeze(-1).repeat(1, 1, 9)) + topk_feats = torch.gather( + feats, 1, + topk_indices.unsqueeze(-1).repeat(1, 1, feats.size(-1))) + + decoder_inputs_dict = dict( + query=topk_feats, + feats=feats, + feats_attention_mask=~feats_mask, + query_coords=topk_query_coords, + feats_coords=coords, + pred_bboxes=topk_pred_bboxes.detach().clone(), + text_feats=text_feats, + text_attention_mask=~text_token_mask) + + 
def loss(self, batch_inputs_dict: dict, batch_data_samples: SampleList,
         **kwargs) -> Union[dict, list]:
    """Compute the training losses for a batch.

    The text prompts are tokenised and encoded, a per-sample positive map
    and text token mask are attached to ``gt_instances_3d`` of every data
    sample, visual features are extracted and decoded, and the result is
    handed to ``self.bbox_head.loss``.

    Args:
        batch_inputs_dict (dict): The model input dict. ``points`` (a list
            of tensors) is read here to pick the device; the whole dict is
            forwarded to ``extract_feat``.
        batch_data_samples (SampleList): The data samples. ``text``,
            ``tokens_positive`` and ``gt_instances_3d`` are read from each.

    Returns:
        The value returned by ``self.bbox_head.loss``.
    """
    device = batch_inputs_dict['points'][0].device

    # gather the language inputs of the batch
    text_prompts = []
    tokens_positive = []
    for sample in batch_data_samples:
        text_prompts.append(sample.text)
        tokens_positive.append(sample.tokens_positive)

    tokenized = self.tokenizer.batch_encode_plus(
        text_prompts, padding='longest', return_tensors='pt').to(device)
    positive_maps = self.get_positive_map(tokenized, tokens_positive)

    last_hidden_state = self.text_encoder(**tokenized).last_hidden_state
    # (bs, max_text_length)
    batch_token_mask = tokenized.attention_mask.bool()
    text_dict = dict(text_feats=self.text_feat_map(last_hidden_state),
                     text_token_mask=batch_token_mask)

    # attach the per-sample language targets to the ground truth
    for i, sample in enumerate(batch_data_samples):
        gt_instances = sample.gt_instances_3d
        # binarise the normalised map -> (1, max_text_length)
        binary_map = positive_maps[i].to(device).bool().float().unsqueeze(0)
        gt_instances.positive_maps = binary_map
        # one mask row per positive-map row
        gt_instances.text_token_mask = batch_token_mask[i].unsqueeze(
            0).repeat(len(binary_map), 1)

    point_feats, scores, point_xyz = self.extract_feat(
        batch_inputs_dict, batch_data_samples)
    head_inputs_dict = self.forward_transformer(point_feats, scores,
                                                point_xyz, text_dict,
                                                batch_data_samples)
    return self.bbox_head.loss(**head_inputs_dict,
                               batch_data_samples=batch_data_samples)
def create_positive_map(tokenized,
                        tokens_positive: list,
                        max_num_entities: int = 256) -> Tensor:
    """Build a map with ``positive_map[i, j] > 0`` iff row i covers token j.

    Each entry of ``tokens_positive`` is a list of ``(beg, end)`` character
    spans. Every span is converted to a token range through
    ``tokenized.char_to_token`` and the covered tokens of the corresponding
    row are set to one; every row is finally normalised by its sum.

    When ``tokenized`` holds more than one encoded sequence (a batched
    encoding), row ``i`` is resolved against batch entry ``i``. A
    one-argument ``char_to_token`` call always addresses batch entry 0, which
    would map every sample's spans against the first prompt. With a single
    encoded sequence all rows are resolved against it.

    Args:
        tokenized: The tokenizer output. It must provide ``char_to_token``;
            an ``encodings`` attribute, if present, is used to detect a
            batched encoding.
        tokens_positive (list): One list of ``(beg, end)`` character spans
            per row of the map.
        max_num_entities (int, optional): Number of columns of the map.
            Defaults to 256.

    Returns:
        torch.Tensor: Float tensor of shape
        ``(len(tokens_positive), max_num_entities)``.

    Raises:
        Exception: Whatever ``char_to_token`` raises for the exact span
            boundaries is re-raised after printing the offending span.
    """
    positive_map = torch.zeros((len(tokens_positive), max_num_entities),
                               dtype=torch.float)

    encodings = getattr(tokenized, 'encodings', None)
    is_batched = encodings is not None and len(encodings) > 1

    def _char_to_token(row: int, char_idx: int):
        """Resolve a character index against the right batch entry."""
        if is_batched:
            return tokenized.char_to_token(row, char_idx)
        return tokenized.char_to_token(char_idx)

    def _first_mapped(row: int, candidates):
        """Return the first non-None token index, or None on any failure."""
        for char_idx in candidates:
            try:
                token_idx = _char_to_token(row, char_idx)
            except Exception:
                return None
            if token_idx is not None:
                return token_idx
        return None

    for j, tok_list in enumerate(tokens_positive):
        for (beg, end) in tok_list:
            try:
                beg_pos = _char_to_token(j, beg)
                end_pos = _char_to_token(j, end - 1)
            except Exception as e:
                print('beg:', beg, 'end:', end)
                print('token_positive:', tokens_positive)
                raise e
            # A boundary may fall on a character without a token (e.g.
            # whitespace); probe up to two characters inwards.
            if beg_pos is None:
                beg_pos = _first_mapped(j, (beg + 1, beg + 2))
            if end_pos is None:
                end_pos = _first_mapped(j, (end - 2, end - 3))
            if beg_pos is None or end_pos is None:
                continue

            positive_map[j, beg_pos:end_pos + 1].fill_(1)

    # the epsilon keeps all-zero rows finite
    return positive_map / (positive_map.sum(-1)[:, None] + 1e-6)
+ - "predict": Forward and return the predictions, which are fully + processed to a list of :obj:`Det3DDataSample`. + - "loss": Forward and return a dict of losses according to the given + inputs and data samples. + + Note that this method doesn't handle neither back propagation nor + optimizer updating, which are done in the :meth:`train_step`. + + Args: + inputs (dict | list[dict]): When it is a list[dict], the + outer list indicate the test time augmentation. Each + dict contains batch inputs + which include 'points' and 'imgs' keys. + + - points (list[torch.Tensor]): Point cloud of each sample. + - imgs (torch.Tensor): Image tensor has shape (B, C, H, W). + data_samples (list[:obj:`Det3DDataSample`], + list[list[:obj:`Det3DDataSample`]], optional): The + annotation data of every samples. When it is a list[list], the + outer list indicate the test time augmentation, and the + inter list indicate the batch. Otherwise, the list simply + indicate the batch. Defaults to None. + mode (str): Return what kind of value. Defaults to 'tensor'. + + Returns: + The return type depends on ``mode``. + + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of :obj:`Det3DDataSample`. + - If ``mode="loss"``, return a dict of tensor. + """ + if mode == 'loss': + return self.loss(inputs, data_samples, **kwargs) + elif mode == 'predict': + return self.predict(inputs, data_samples, **kwargs) + else: + raise RuntimeError(f'Invalid mode "{mode}". ' + 'Only supports loss, predict and tensor mode') + + def _forward(self, + batch_inputs_dict: dict, + batch_data_samples: OptSampleList = None, + **kwargs) -> Tuple[List[torch.Tensor]]: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + batch_inputs_dict (dict): The model input dict which include + 'points', 'img' keys. + + - points (list[torch.Tensor]): Point cloud of each sample. 
def add_pred_to_datasample(
    self,
    data_samples: SampleList,
    data_instances_3d: Optional[InstanceList] = None,
    data_instances_2d: Optional[InstanceList] = None,
) -> SampleList:
    """Attach per-sample prediction results to the data samples.

    At least one of the two result lists has to be given; the missing one
    is replaced by a list of empty ``InstanceData`` objects of the same
    length as the given one.

    Args:
        data_samples (SampleList): The input data samples; modified in
            place.
        data_instances_3d (list, optional): 3D results of each sample,
            stored as ``pred_instances_3d``.
        data_instances_2d (list, optional): 2D results of each sample,
            stored as ``pred_instances``.

    Returns:
        SampleList: The same ``data_samples`` object, with
        ``pred_instances_3d`` and ``pred_instances`` set on every sample.
    """
    nothing_given = data_instances_2d is None and data_instances_3d is None
    assert not nothing_given, \
        'please pass at least one type of data_samples'

    # fill the missing modality with empty placeholders
    if data_instances_2d is None:
        data_instances_2d = [InstanceData() for _ in data_instances_3d]
    if data_instances_3d is None:
        data_instances_3d = [InstanceData() for _ in data_instances_2d]

    for idx, sample in enumerate(data_samples):
        sample.pred_instances_3d = data_instances_3d[idx]
        sample.pred_instances = data_instances_2d[idx]
    return data_samples
class PositionEmbeddingLearned(BaseModule):
    """Learned absolute positional embedding.

    A two-layer point-wise network (1x1 ``Conv1d`` -> ``BatchNorm1d`` ->
    ``ReLU`` -> 1x1 ``Conv1d``) applied independently to every position.

    Args:
        input_channel (int): Size of the last dimension of the input.
        embed_dims (int): Size of the produced embedding. Defaults to 256.
    """

    def __init__(self, input_channel, embed_dims=256):
        super().__init__()
        layers = [
            nn.Conv1d(input_channel, embed_dims, kernel_size=1),
            nn.BatchNorm1d(embed_dims),
            nn.ReLU(inplace=True),
            nn.Conv1d(embed_dims, embed_dims, kernel_size=1),
        ]
        self.position_embedding_head = nn.Sequential(*layers)

    def forward(self, xyz):
        """Embed ``xyz`` of shape (B, N, input_channel) into (B, N, F)."""
        # Conv1d expects channels first: (B, C, N)
        channels_first = xyz.transpose(1, 2).contiguous()
        embedded = self.position_embedding_head(channels_first)
        # back to (B, N, F)
        return embedded.transpose(1, 2).contiguous()
+ + self.ffn_cfg = ffn_cfg + self.norm_cfg = norm_cfg + self._init_layers() + + def _init_layers(self) -> None: + """Initialize self_attn, cross-attn, ffn, and norms.""" + self.self_attn = MultiheadAttention(**self.self_attn_cfg) + self.cross_attn_text = MultiheadAttention(**self.cross_attn_text_cfg) + self.cross_attn = MultiheadAttention(**self.cross_attn_cfg) + self.embed_dims = self.self_attn.embed_dims + self.ffn = FFN(**self.ffn_cfg) + norms_list = [ + build_norm_layer(self.norm_cfg, self.embed_dims)[1] + for _ in range(4) + ] + self.norms = ModuleList(norms_list) + self.self_posembed = PositionEmbeddingLearned(3, self.embed_dims) + + def forward(self, + query: Tensor, + key: Tensor = None, + value: Tensor = None, + query_pos: Tensor = None, + key_pos: Tensor = None, + self_attn_mask: Tensor = None, + cross_attn_mask: Tensor = None, + key_padding_mask: Tensor = None, + memory_text: Tensor = None, + text_attention_mask: Tensor = None, + **kwargs) -> Tensor: + """Implements decoder layer in Grounding DINO transformer. + + Args: + query (Tensor): The input query, has shape (bs, num_queries, dim). + key (Tensor, optional): The input key, has shape (bs, num_keys, + dim). If `None`, the `query` will be used. Defaults to `None`. + value (Tensor, optional): The input value, has the same shape as + `key`, as in `nn.MultiheadAttention.forward`. If `None`, the + `key` will be used. Defaults to `None`. + query_pos (Tensor, optional): The positional encoding for `query`, + has the same shape as `query`. If not `None`, it will be added + to `query` before forward function. Defaults to `None`. + key_pos (Tensor, optional): The positional encoding for `key`, has + the same shape as `key`. If not `None`, it will be added to + `key` before forward function. If None, and `query_pos` has the + same shape as `key`, then `query_pos` will be used for + `key_pos`. Defaults to None. 
class SparseFeatureFusionTransformerDecoder(BaseModule):
    """Transformer decoder that iteratively refines 9-DoF box queries.

    Every layer attends over the queries, the text features and the visual
    features; after each layer the boxes are re-predicted from the updated
    queries with the regression branch of the given ``bbox_head``.

    Args:
        num_layers (int): Number of decoder layers.
        layer_cfg (:obj:`ConfigDict` or dict): The config of each decoder
            layer. All the layers share the same config.
        post_norm_cfg (:obj:`ConfigDict` or dict, optional): Must be passed
            as ``None``; this decoder has no configurable post norm and
            ``_init_layers`` raises ``ValueError`` for any other value,
            including the default.
        return_intermediate (bool, optional): Whether to return outputs of
            intermediate layers. Defaults to `True`.
        init_cfg (:obj:`ConfigDict` or dict, optional): The config to control
            the initialization. Defaults to None.
    """

    def __init__(self,
                 num_layers: int,
                 layer_cfg: ConfigType,
                 post_norm_cfg: OptConfigType = dict(type='LN'),
                 return_intermediate: bool = True,
                 init_cfg: Union[dict, ConfigDict] = None) -> None:
        super().__init__(init_cfg=init_cfg)
        self.layer_cfg = layer_cfg
        self.num_layers = num_layers
        self.post_norm_cfg = post_norm_cfg
        self.return_intermediate = return_intermediate
        self._init_layers()

    def _init_layers(self) -> None:
        """Initialize decoder layers, positional embeddings and the norm.

        Raises:
            ValueError: If ``post_norm_cfg`` is not ``None``.
        """
        self.layers = ModuleList([
            SparseFeatureFusionTransformerDecoderLayer(**self.layer_cfg)
            for _ in range(self.num_layers)
        ])
        self.embed_dims = self.layers[0].embed_dims
        if self.post_norm_cfg is not None:
            raise ValueError('There is not post_norm in '
                             f'{self._get_name()}')
        # queries are embedded from their 9-dim boxes, keys from xyz
        self.self_posembed = PositionEmbeddingLearned(9, self.embed_dims)
        self.cross_posembed = PositionEmbeddingLearned(3, self.embed_dims)
        self.norm = nn.LayerNorm(self.embed_dims)

    def forward(self, query: Tensor, key: Tensor, value: Tensor,
                key_padding_mask: Tensor, self_attn_mask: Tensor,
                cross_attn_mask: Tensor, query_coords: Tensor,
                key_coords: Tensor, pred_bboxes: Tensor, text_feats: Tensor,
                text_attention_mask: Tensor, bbox_head: nn.ModuleList,
                **kwargs) -> Tuple[Tensor]:
        """Forward function of the decoder.

        Args:
            query (Tensor): The input queries, (bs, num_queries, dim).
            key (Tensor): The visual features attended to,
                (bs, num_keys, dim).
            value (Tensor): The visual values, same shape as ``key``.
            key_padding_mask (Tensor): Padding mask of ``key``, forwarded to
                the visual cross attention of every layer.
            self_attn_mask (Tensor): Attention mask for the query self
                attention; may be None.
            cross_attn_mask (Tensor): Attention mask for the visual cross
                attention; may be None.
            query_coords (Tensor): Coordinates of the queries, passed to
                ``bbox_head._bbox_pred_to_bbox`` together with the
                regression output of every layer.
            key_coords (Tensor): Coordinates of the keys,
                (bs, num_keys, 3); embedded as the key positional encoding.
            pred_bboxes (Tensor): Initial boxes of the queries,
                (bs, num_queries, 9); embedded as the query positional
                encoding.
            text_feats (Tensor): Text features attended to by every layer.
            text_attention_mask (Tensor): Padding mask of ``text_feats``.
            bbox_head: Head providing ``reg_branches`` and
                ``_bbox_pred_to_bbox``. If None, the boxes are not refined
                and the incoming ``pred_bboxes`` are returned unchanged.

        Returns:
            tuple[Tensor]: ``(queries, bboxes)``. With
            ``return_intermediate`` both are stacked over the decoder
            layers (the queries after ``self.norm``); otherwise only the
            last layer's raw query and boxes are returned.
        """
        intermediate = []
        intermediate_bboxes = []
        # Keeps the result defined when no head refines the boxes.
        new_pred_bboxes = pred_bboxes
        for lid, layer in enumerate(self.layers):

            query_pos = self.self_posembed(pred_bboxes)
            key_pos = self.cross_posembed(key_coords)
            query = layer(query=query,
                          key=key,
                          value=value,
                          query_pos=query_pos,
                          key_pos=key_pos,
                          memory_text=text_feats,
                          self_attn_mask=self_attn_mask,
                          cross_attn_mask=cross_attn_mask,
                          key_padding_mask=key_padding_mask,
                          text_attention_mask=text_attention_mask,
                          **kwargs)

            if bbox_head is not None:
                # (bs, num_query, 9)
                bbox_preds = bbox_head.reg_branches[lid](query)
                new_pred_bboxes = bbox_head._bbox_pred_to_bbox(
                    query_coords, bbox_preds)
                # the next layer's positional encoding must not
                # back-propagate into this layer's box prediction
                pred_bboxes = new_pred_bboxes.detach().clone()

            if self.return_intermediate:
                intermediate.append(self.norm(query))
                intermediate_bboxes.append(new_pred_bboxes)

        if self.return_intermediate:
            return torch.stack(intermediate), torch.stack(intermediate_bboxes)

        return query, new_pred_bboxes
class BaseMatchCost:
    """Common base of the match costs.

    Stores the scalar ``weight`` that concrete costs multiply their cost
    matrix with.

    Args:
        weight (Union[float, int]): Cost weight. Defaults to 1.
    """

    def __init__(self, weight: Union[float, int] = 1.) -> None:
        self.weight = weight

    @abstractmethod
    def __call__(self,
                 pred_instances: InstanceData,
                 gt_instances: InstanceData,
                 img_meta: Optional[dict] = None,
                 **kwargs) -> Tensor:
        """Compute the match cost; to be overridden by subclasses.

        Args:
            pred_instances (:obj:`InstanceData`): Instances of model
                predictions.
            gt_instances (:obj:`InstanceData`): Ground truth of instance
                annotations.
            img_meta (dict, optional): Image information.

        Returns:
            Tensor: Subclasses return a cost matrix of shape
            (num_preds, num_gts); this base implementation does nothing
            and returns None.
        """
        return None
+ """ + pred_bboxes = pred_instances.bboxes_3d.tensor # (num_preds, 9) + gt_bboxes = gt_instances.bboxes_3d.tensor # (num_gts, 9) + + bbox_cost = torch.cdist(pred_bboxes, gt_bboxes, + p=1) # (num_preds, num_gt) + return bbox_cost * self.weight + + +@TASK_UTILS.register_module() +class TokenMapCost(BaseMatchCost): + """TokenPredictionCost.""" + + def __call__(self, pred_logits: Tensor, gt_logits: Tensor) -> Tensor: + """Compute match cost. + + Args: + pred_logits (Tensor): Shape [num_query, C]. + gt_logits (Tensor): Shape [num_gt, C]. + Returns: + Tensor: Match Cost matrix of shape (num_preds, num_gts). + """ + token_map_cost = torch.matmul(pred_logits, gt_logits.transpose(0, 1)) + return token_map_cost * self.weight + + +@TASK_UTILS.register_module() +class IoU3DCost(object): + """3D IoU cost for 3D boxes.""" + + def __init__(self, weight): + self.weight = weight + + def __call__(self, pred_instances: InstanceData, + gt_instances: InstanceData): + pred_bboxes = EulerDepthInstance3DBoxes( + pred_instances.bboxes_3d.tensor, origin=(0.5, 0.5, 0.5)) + gt_bboxes = EulerDepthInstance3DBoxes(gt_instances.bboxes_3d.tensor, + origin=(0.5, 0.5, 0.5)) + overlaps = pred_bboxes.overlaps(pred_bboxes, gt_bboxes) + + # The 1 is a constant that doesn't change the matching, so omitted. + iou_cost = -overlaps + + return iou_cost * self.weight + + +@TASK_UTILS.register_module() +class FocalLossCost(BaseMatchCost): + """FocalLossCost. + + Args: + alpha (Union[float, int]): focal_loss alpha. Defaults to 0.25. + gamma (Union[float, int]): focal_loss gamma. Defaults to 2. + eps (float): Defaults to 1e-12. + binary_input (bool): Whether the input is binary. Currently, + binary_input = True is for masks input, binary_input = False + is for label input. Defaults to False. + weight (Union[float, int]): Cost weight. Defaults to 1. 
+ """ + + def __init__(self, + alpha: Union[float, int] = 0.25, + gamma: Union[float, int] = 2, + eps: float = 1e-12, + binary_input: bool = False, + weight: Union[float, int] = 1.) -> None: + super().__init__(weight=weight) + self.alpha = alpha + self.gamma = gamma + self.eps = eps + self.binary_input = binary_input + + def _focal_loss_cost(self, cls_pred: Tensor, gt_labels: Tensor) -> Tensor: + """ + Args: + cls_pred (Tensor): Predicted classification logits, shape + (num_queries, num_class). + gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). + + Returns: + torch.Tensor: cls_cost value with weight + """ + cls_pred = cls_pred.sigmoid() + neg_cost = -(1 - cls_pred + self.eps).log() * ( + 1 - self.alpha) * cls_pred.pow(self.gamma) + pos_cost = -(cls_pred + self.eps).log() * self.alpha * ( + 1 - cls_pred).pow(self.gamma) + + cls_cost = pos_cost[:, gt_labels] - neg_cost[:, gt_labels] + return cls_cost * self.weight + + def _mask_focal_loss_cost(self, cls_pred, gt_labels) -> Tensor: + """ + Args: + cls_pred (Tensor): Predicted classification logits. + in shape (num_queries, d1, ..., dn), dtype=torch.float32. + gt_labels (Tensor): Ground truth in shape (num_gt, d1, ..., dn), + dtype=torch.long. Labels should be binary. + + Returns: + Tensor: Focal cost matrix with weight in shape\ + (num_queries, num_gt). + """ + cls_pred = cls_pred.flatten(1) + gt_labels = gt_labels.flatten(1).float() + n = cls_pred.shape[1] + cls_pred = cls_pred.sigmoid() + neg_cost = -(1 - cls_pred + self.eps).log() * ( + 1 - self.alpha) * cls_pred.pow(self.gamma) + pos_cost = -(cls_pred + self.eps).log() * self.alpha * ( + 1 - cls_pred).pow(self.gamma) + + cls_cost = torch.einsum('nc,mc->nm', pos_cost, gt_labels) + \ + torch.einsum('nc,mc->nm', neg_cost, (1 - gt_labels)) + return cls_cost / n * self.weight + + def __call__(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + img_meta: Optional[dict] = None, + **kwargs) -> Tensor: + """Compute match cost. 
+ + Args: + pred_instances (:obj:`InstanceData`): Predicted instances which + must contain ``scores`` or ``masks``. + gt_instances (:obj:`InstanceData`): Ground truth which must contain + ``labels`` or ``mask``. + img_meta (Optional[dict]): Image information. Defaults to None. + + Returns: + Tensor: Match Cost matrix of shape (num_preds, num_gts). + """ + if self.binary_input: + pred_masks = pred_instances.masks + gt_masks = gt_instances.masks + return self._mask_focal_loss_cost(pred_masks, gt_masks) + else: + pred_scores = pred_instances.scores + gt_labels = gt_instances.labels + return self._focal_loss_cost(pred_scores, gt_labels) + + +@TASK_UTILS.register_module() +class BinaryFocalLossCost(FocalLossCost): + """Binary focal loss cost.""" + + def _focal_loss_cost(self, cls_pred: Tensor, gt_labels: Tensor) -> Tensor: + """ + Args: + cls_pred (Tensor): Predicted classification logits, shape + (num_queries, num_class). + gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). + + Returns: + torch.Tensor: cls_cost value with weight + """ + cls_pred = cls_pred.flatten(1) + gt_labels = gt_labels.flatten(1).float() + cls_pred = cls_pred.sigmoid() + neg_cost = -(1 - cls_pred + self.eps).log() * ( + 1 - self.alpha) * cls_pred.pow(self.gamma) + pos_cost = -(cls_pred + self.eps).log() * self.alpha * ( + 1 - cls_pred).pow(self.gamma) + + cls_cost = torch.einsum('nc,mc->nm', pos_cost, gt_labels) + \ + torch.einsum('nc,mc->nm', neg_cost, (1 - gt_labels)) + return cls_cost * self.weight + + def __call__(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + img_meta: Optional[dict] = None, + **kwargs) -> Tensor: + """Compute match cost. + + Args: + pred_instances (:obj:`InstanceData`): Predicted instances which + must contain ``scores`` or ``masks``. + gt_instances (:obj:`InstanceData`): Ground truth which must contain + ``labels`` or ``mask``. + img_meta (Optional[dict]): Image information. Defaults to None. 
+ + Returns: + Tensor: Match Cost matrix of shape (num_preds, num_gts). + """ + # gt_instances.text_token_mask is a repeated tensor of the same length + # of instances. Only gt_instances.text_token_mask[0] is useful + text_token_mask = torch.nonzero( + gt_instances.text_token_mask[0]).squeeze(-1) + # mask used to filter padding texts + # (num_query,) + pred_scores = pred_instances.scores_3d[:, text_token_mask] + # (1, real_tex_length) + gt_labels = gt_instances.positive_maps[:, text_token_mask] + return self._focal_loss_cost(pred_scores, gt_labels) diff --git a/embodiedscan/models/necks/__init__.py b/embodiedscan/models/necks/__init__.py new file mode 100644 index 0000000..03e9ecd --- /dev/null +++ b/embodiedscan/models/necks/__init__.py @@ -0,0 +1,4 @@ +from .channel_mapper import ChannelMapper +from .mink_neck import MinkNeck + +__all__ = ['ChannelMapper', 'MinkNeck'] diff --git a/embodiedscan/models/necks/channel_mapper.py b/embodiedscan/models/necks/channel_mapper.py new file mode 100644 index 0000000..1471b86 --- /dev/null +++ b/embodiedscan/models/necks/channel_mapper.py @@ -0,0 +1,90 @@ +# Copyright (c) OpenRobotLab. All rights reserved. +from typing import List, Optional, Tuple + +import torch.nn as nn +from mmengine.model import BaseModule +from torch import Tensor + +try: + import MinkowskiEngine as ME +except ImportError: + # Please follow getting_started.md to install MinkowskiEngine. + ME = None + pass + +from embodiedscan.registry import MODELS + + +@MODELS.register_module() +class ChannelMapper(BaseModule): + """Channel Mapper to reduce/increase channels of backbone features. + + This is used to reduce/increase channels of backbone features. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale). + kernel_size (int, optional): kernel_size for reducing channels (used + at each scale). Default: 3. 
@MODELS.register_module()
class ChannelMapper(BaseModule):
    """Channel Mapper to reduce/increase channels of backbone features.

    One Minkowski Conv-BatchNorm-ELU block is created per input level; every
    level is mapped to the same number of output channels.

    Args:
        in_channels (List[int]): Number of input channels per scale.
        out_channels (int): Number of output channels (used at each scale).
        kernel_size (int, optional): kernel_size for reducing channels (used
            at each scale). Default: 1.
        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or dict],
            optional): Initialization config dict.
    """

    def __init__(self,
                 in_channels: List[int],
                 out_channels: int,
                 kernel_size: int = 1,
                 init_cfg: Optional[dict] = None) -> None:
        super().__init__(init_cfg=init_cfg)
        if ME is None:
            # Without this guard the failure would surface later as an
            # AttributeError on ``None`` inside ``_make_conv_block``.
            raise ImportError(
                'MinkowskiEngine is required by ChannelMapper but could not '
                'be imported.')
        assert isinstance(in_channels, list)
        self.convs = nn.ModuleList([
            self._make_conv_block(in_channel, out_channels, kernel_size)
            for in_channel in in_channels
        ])

    def _make_conv_block(self, in_channels: int, out_channels: int,
                         kernel_size: int) -> nn.Module:
        """Construct Conv-Norm-Act block.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            kernel_size (int): Kernel size of the convolution.

        Returns:
            torch.nn.Module: With corresponding layers.
        """
        return nn.Sequential(
            ME.MinkowskiConvolution(in_channels,
                                    out_channels,
                                    kernel_size=kernel_size,
                                    dimension=3),
            ME.MinkowskiBatchNorm(out_channels), ME.MinkowskiELU())

    def forward(self, inputs: List[Tensor]) -> List[Tensor]:
        """Apply the i-th block to the i-th input level.

        Args:
            inputs (List[Tensor]): One feature per level, in the same order
                as ``in_channels``. NOTE(review): the blocks are Minkowski
                layers, so these are presumably sparse tensors -- confirm
                against the caller.

        Returns:
            List[Tensor]: The mapped features, one per level.
        """
        assert len(inputs) == len(self.convs)
        return [conv(feat) for conv, feat in zip(self.convs, inputs)]
@MODELS.register_module()
class MinkNeck(BaseModule):
    """MinkEngine based 3D Neck.

    Actually here we store both the sparse 3D FPN and a head. The neck and
    the head can not be simply separated as pruning score on the i-th level
    of FPN requires classification scores from i+1-th level of the head.

    Args:
        num_classes (int): Number of classes, i.e. output channels of the
            classification convolution.
        in_channels (tuple(int)): Number of channels in input tensors.
        out_channels (int): Number of channels in the neck output tensors.
        voxel_size (float): Voxel size in meters.
        pts_prune_threshold (int): Maximum number of points kept per sample
            when a feature level is pruned.
        train_cfg (dict, optional): Config for train stage. Defaults to None.
        test_cfg (dict, optional): Config for test stage. Defaults to None.
        init_cfg (dict, optional): Config for weight initialization.
            Defaults to None.
    """

    def __init__(
            self,
            num_classes: int,  # 1
            in_channels: Tuple[int],
            out_channels: int,
            voxel_size: float,
            pts_prune_threshold: int,
            train_cfg: Optional[dict] = None,
            test_cfg: Optional[dict] = None,
            init_cfg: Optional[dict] = None):
        super(MinkNeck, self).__init__(init_cfg)
        if ME is None:
            raise ImportError(
                'Please follow `get_started.md` to install MinkowskiEngine.`')
        self.voxel_size = voxel_size
        self.pts_prune_threshold = pts_prune_threshold
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        self._init_layers(in_channels, out_channels, num_classes)

    @staticmethod
    def _make_block(in_channels: int, out_channels: int) -> nn.Module:
        """Construct Conv-Norm-Act block.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.

        Returns:
            torch.nn.Module: With corresponding layers.
        """
        return nn.Sequential(
            ME.MinkowskiConvolution(in_channels,
                                    out_channels,
                                    kernel_size=3,
                                    dimension=3),
            ME.MinkowskiBatchNorm(out_channels), ME.MinkowskiELU())

    @staticmethod
    def _make_up_block(in_channels: int, out_channels: int) -> nn.Module:
        """Construct DeConv-Norm-Act-Conv-Norm-Act block.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.

        Returns:
            torch.nn.Module: With corresponding layers.
        """
        return nn.Sequential(
            ME.MinkowskiGenerativeConvolutionTranspose(in_channels,
                                                       out_channels,
                                                       kernel_size=2,
                                                       stride=2,
                                                       dimension=3),
            ME.MinkowskiBatchNorm(out_channels), ME.MinkowskiELU(),
            ME.MinkowskiConvolution(out_channels,
                                    out_channels,
                                    kernel_size=3,
                                    dimension=3),
            ME.MinkowskiBatchNorm(out_channels), ME.MinkowskiELU())

    def _init_layers(self, in_channels: Tuple[int], out_channels: int,
                     num_classes: int):
        """Initialize layers.

        Creates ``out_block_{i}`` for every level, ``up_block_{i}`` for every
        level except the first, the pruning layer and the classification
        convolution.

        Args:
            in_channels (tuple[int]): Number of channels in input tensors.
            out_channels (int): Number of channels in the neck output tensors.
            num_classes (int): Number of classes.
        """
        # neck layers
        self.pruning = ME.MinkowskiPruning()
        for i, channels in enumerate(in_channels):
            if i > 0:
                # Upsamples level i to the channel width of level i - 1.
                setattr(self, f'up_block_{i}',
                        self._make_up_block(channels, in_channels[i - 1]))
            setattr(self, f'out_block_{i}',
                    self._make_block(channels, out_channels))

        # head layers
        self.conv_cls = ME.MinkowskiConvolution(out_channels,
                                                num_classes,
                                                kernel_size=1,
                                                bias=True,
                                                dimension=3)

    def init_weights(self):
        """Initialize weights."""
        nn.init.normal_(self.conv_cls.kernel, std=.01)
        nn.init.constant_(self.conv_cls.bias, bias_init_with_prob(.01))

    def forward(self, x: List[Tensor], batch_size) -> Tuple[List[Tensor], ...]:
        """Forward pass.

        Levels are processed from the last input to the first. Every level
        but the last is built from the upsampled previous result plus the
        backbone feature, then pruned with the scores of the previous level.

        Args:
            x (list[Tensor]): Features from the backbone.
            batch_size (int): Number of samples in the batch; forwarded to
                :meth:`convert_to_batch`.

        Returns:
            Tuple[List[Tensor], ...]: Per-sample features, classification
            scores and point coordinates, each concatenated over levels.
        """
        feats, cls_preds, points = [], [], []
        inputs = x
        x = inputs[-1]
        prune_score = None
        last_level = len(inputs) - 1
        for i in range(last_level, -1, -1):
            if i < last_level:
                x = getattr(self, f'up_block_{i + 1}')(x)
                x = inputs[i] + x
                # ``prune_score`` was produced by the previous iteration.
                x = self._prune(x, prune_score)

            out = getattr(self, f'out_block_{i}')(x)
            feat, cls_pred, point, prune_score = \
                self._forward_single(out)
            feats.append(feat)
            cls_preds.append(cls_pred)
            points.append(point)
        batch_feats_list, batch_scores_list, batch_points_list = \
            self.convert_to_batch(feats, cls_preds, points, batch_size)
        return batch_feats_list, batch_scores_list, batch_points_list

    def _prune(self, x: SparseTensor, scores: SparseTensor) -> SparseTensor:
        """Prunes the tensor by score thresholding.

        For every sample the ``pts_prune_threshold`` highest scoring points
        (or all points if there are fewer) are kept.

        Args:
            x (SparseTensor): Tensor to be pruned.
            scores (SparseTensor): Scores for thresholding.

        Returns:
            SparseTensor: Pruned tensor.
        """
        # Only the selection of points is excluded from autograd; the pruning
        # op itself is applied outside of ``no_grad`` (statement nesting
        # follows the upstream FCAF3D head this file is adapted from).
        with torch.no_grad():
            coordinates = x.C.float()
            interpolated_scores = scores.features_at_coordinates(coordinates)
            prune_mask = interpolated_scores.new_zeros(
                (len(interpolated_scores)), dtype=torch.bool)
            for permutation in x.decomposition_permutations:
                score = interpolated_scores[permutation]
                topk = min(len(score), self.pts_prune_threshold)
                ids = torch.topk(score.squeeze(1), topk, sorted=False).indices
                prune_mask[permutation[ids]] = True
        x = self.pruning(x, prune_mask)
        return x

    def _forward_single(self, x: SparseTensor) -> Tuple[Tensor, ...]:
        """Forward pass per level.

        Args:
            x (SparseTensor): Per level neck output tensor.

        Returns:
            tuple: Per-sample lists of features, classification predictions
            and point coordinates (scaled by ``voxel_size``), followed by a
            sparse tensor with the per-point maximum score used for pruning
            the next level.
        """
        feat = x.features
        scores = self.conv_cls(x)
        cls_pred = scores.features
        prune_scores = ME.SparseTensor(
            scores.features.max(dim=1, keepdim=True).values,
            coordinate_map_key=scores.coordinate_map_key,
            coordinate_manager=scores.coordinate_manager)

        # Split the batched sparse features into one tensor per sample.
        feats, cls_preds = [], []
        for permutation in x.decomposition_permutations:
            feats.append(feat[permutation])
            cls_preds.append(cls_pred[permutation])

        # Voxel indices -> coordinates scaled by the voxel size.
        points = [
            coords * self.voxel_size for coords in x.decomposed_coordinates
        ]

        return feats, cls_preds, points, prune_scores

    def convert_to_batch(self, feats, scores, points, batch_size):
        """Regroup per-level outputs into per-sample outputs.

        Args:
            feats (list[list[Tensor]]): Feats for all
                scenes. The first list contains predictions from different
                levels. The second list contains predictions in a mini-batch.
            scores (list[list[Tensor]]): Classification predictions with the
                same nesting as ``feats``.
            points (list[list[Tensor]]): Final location coordinates for all
                scenes. The first list contains predictions from different
                levels. The second list contains predictions in a mini-batch.
            batch_size (int): Number of samples in the mini-batch.

        Returns:
            tuple[list[Tensor]]: Features, scores and points; each list has
            one tensor per sample, concatenated over all levels along dim 0.
        """
        batch_feats_list = []
        batch_scores_list = []
        batch_points_list = []
        for i in range(batch_size):
            feats_list = [x[i] for x in feats]
            scores_list = [x[i] for x in scores]
            points_list = [x[i] for x in points]
            batch_feats_list.append(torch.cat(feats_list, dim=0))
            batch_scores_list.append(torch.cat(scores_list, dim=0))
            batch_points_list.append(torch.cat(points_list, dim=0))

        return batch_feats_list, batch_scores_list, batch_points_list
@TASK_UTILS.register_module()
class HungarianAssigner3D(BaseAssigner):
    """Computes one-to-one matching between predictions and ground truth.

    This class computes an assignment between the targets and the
    predictions based on the costs. The total cost is the sum of all
    configured match costs (each already scaled by its own weight). The
    targets don't include the no_object, so generally there are more
    predictions than targets. After the one-to-one matching, the un-matched
    are treated as backgrounds. Thus each query prediction will be assigned
    with `0` or a positive integer indicating the ground truth index:

    - 0: negative sample, no assigned gt
    - positive integer: positive sample, index (1-based) of assigned gt

    Args:
        match_costs (dict | ConfigDict | list[dict | ConfigDict]): Config(s)
            of the match costs, built through ``TASK_UTILS``. A single
            config is wrapped into a list.
    """

    def __init__(
        self, match_costs: Union[List[Union[dict, ConfigDict]], dict,
                                 ConfigDict]
    ) -> None:

        if isinstance(match_costs, dict):
            match_costs = [match_costs]
        elif isinstance(match_costs, list):
            assert len(match_costs) > 0, \
                'match_costs must not be a empty list.'

        self.match_costs = [
            TASK_UTILS.build(match_cost) for match_cost in match_costs
        ]

    def assign(self,
               pred_instances_3d: InstanceData,
               gt_instances_3d: InstanceData,
               eps=1e-7) -> AssignResult:
        """Computes one-to-one matching based on the weighted costs.

        This method assign each query prediction to a ground truth or
        background. The `assigned_gt_inds` with -1 means don't care, 0 means
        negative sample, and positive number is the index (1-based) of
        assigned gt.

        The assignment is done in the following steps, the order matters.

        1. assign every prediction to -1
        2. compute the weighted costs
        3. do Hungarian matching on CPU based on the costs
        4. assign all to 0 (background) first, then for each matched pair
           between predictions and gts, treat this prediction as foreground
           and assign the corresponding gt index (plus 1) to it.

        Args:
            pred_instances_3d (:obj:`InstanceData`): Predicted instances.
                Which attributes are required depends on the configured
                match costs.
            gt_instances_3d (:obj:`InstanceData`): Ground truth of instance
                annotations. Must contain ``labels_3d``; further attributes
                are required by the configured match costs.
            eps (int | float, optional): Not used by this method; kept for
                interface compatibility. Default 1e-7.

        Returns:
            :obj:`AssignResult`: The assigned result.

        Raises:
            ImportError: If a matching has to be computed and scipy is not
                installed.
        """
        assert isinstance(gt_instances_3d.labels_3d, Tensor)
        num_gts, num_preds = len(gt_instances_3d), len(pred_instances_3d)
        gt_labels = gt_instances_3d.labels_3d
        device = gt_labels.device

        # 1. assign -1 by default
        assigned_gt_inds = torch.full((num_preds, ),
                                      -1,
                                      dtype=torch.long,
                                      device=device)
        assigned_labels = torch.full((num_preds, ),
                                     -1,
                                     dtype=torch.long,
                                     device=device)

        if num_gts == 0 or num_preds == 0:
            # No ground truth or boxes, return empty assignment
            if num_gts == 0:
                # No ground truth, assign all to background
                assigned_gt_inds[:] = 0
            return AssignResult(num_gts=num_gts,
                                gt_inds=assigned_gt_inds,
                                max_overlaps=None,
                                labels=assigned_labels)

        # Fail before the (possibly expensive) cost computation when the
        # solver is unavailable.
        if linear_sum_assignment is None:
            raise ImportError('Please run "pip install scipy" '
                              'to install scipy first.')

        # 2. compute the weighted costs
        cost_list = [
            match_cost(pred_instances=pred_instances_3d,
                       gt_instances=gt_instances_3d)
            for match_cost in self.match_costs
        ]
        cost = torch.stack(cost_list).sum(dim=0)

        # 3. do Hungarian matching on CPU using linear_sum_assignment
        cost = cost.detach().cpu()
        # Non-finite entries are replaced by fixed finite values before the
        # matrix is handed to the solver.
        cost = torch.nan_to_num(cost, nan=100.0, posinf=100.0, neginf=-100.0)
        matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
        matched_row_inds = torch.from_numpy(matched_row_inds).to(device)
        matched_col_inds = torch.from_numpy(matched_col_inds).to(device)

        # 4. assign backgrounds and foregrounds
        # assign all indices to backgrounds first
        assigned_gt_inds[:] = 0
        # assign foregrounds based on matching results
        assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
        assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]

        return AssignResult(num_gts=num_gts,
                            gt_inds=assigned_gt_inds,
                            max_overlaps=None,
                            labels=assigned_labels)
points_cam2img, points_img2cam, rotation_3d_in_axis, rotation_3d_in_euler, xywhr2xyxyr) __all__ = [ - 'Box3DMode', 'BaseInstance3DBoxes', 'EulerInstance3DBoxes', 'xywhr2xyxyr', - 'get_box_type', 'rotation_3d_in_axis', 'rotation_3d_in_euler', - 'limit_period', 'points_cam2img', 'points_img2cam', 'Coord3DMode', - 'mono_cam_box2vis', 'batch_points_cam2img', 'get_proj_mat_by_coord_type' + 'Box3DMode', 'BaseInstance3DBoxes', 'EulerInstance3DBoxes', + 'EulerDepthInstance3DBoxes', 'xywhr2xyxyr', 'get_box_type', + 'rotation_3d_in_axis', 'rotation_3d_in_euler', 'limit_period', + 'points_cam2img', 'points_img2cam', 'Coord3DMode', 'mono_cam_box2vis', + 'batch_points_cam2img', 'get_proj_mat_by_coord_type' ] diff --git a/embodiedscan/utils/__init__.py b/embodiedscan/utils/__init__.py index 87dbb68..f25cc3a 100644 --- a/embodiedscan/utils/__init__.py +++ b/embodiedscan/utils/__init__.py @@ -1,4 +1,7 @@ from .array_converter import ArrayConverter, array_converter -from .typing_config import ConfigType +from .typing_config import ConfigType, OptConfigType, OptMultiConfig -__all__ = ['ConfigType', 'ArrayConverter', 'array_converter'] +__all__ = [ + 'ConfigType', 'OptConfigType', 'OptMultiConfig', 'ArrayConverter', + 'array_converter' +] diff --git a/embodiedscan/utils/typing_config.py b/embodiedscan/utils/typing_config.py index c62af30..af619fe 100644 --- a/embodiedscan/utils/typing_config.py +++ b/embodiedscan/utils/typing_config.py @@ -1,8 +1,9 @@ from collections.abc import Sized -from typing import Dict, List, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union import numpy as np import torch +from mmdet.models.task_modules.samplers import SamplingResult from mmengine.config import ConfigDict from mmengine.structures import BaseDataElement, InstanceData @@ -189,8 +190,21 @@ def __len__(self) -> int: return 0 -SampleList = List[Det3DDataElement] +# Type hint of config data ConfigType = Union[ConfigDict, dict] +OptConfigType = Optional[ConfigType] + +# Type hint 
of one or more config data +MultiConfig = Union[ConfigType, List[ConfigType]] +OptMultiConfig = Optional[MultiConfig] + InstanceList = List[InstanceData] +OptInstanceList = Optional[InstanceList] ForwardResults = Union[Dict[str, torch.Tensor], List[Det3DDataElement], Tuple[torch.Tensor], torch.Tensor] + +SamplingResultList = List[SamplingResult] + +OptSamplingResultList = Optional[SamplingResultList] +SampleList = List[Det3DDataElement] +OptSampleList = Optional[SampleList] diff --git a/tools/test.py b/tools/test.py index f2296b6..1d2df60 100644 --- a/tools/test.py +++ b/tools/test.py @@ -7,7 +7,7 @@ from mmengine.registry import RUNNERS from mmengine.runner import Runner -# from mmdet3d.utils import replace_ceph_backend +# from embodiedscan.utils import replace_ceph_backend # TODO: support fuse_conv_bn and format_only From b57929617fb5b9a6b9c09de06ce0c5fc58f6b288 Mon Sep 17 00:00:00 2001 From: Tai-Wang Date: Tue, 5 Mar 2024 15:51:06 +0800 Subject: [PATCH 2/2] Update the info path for the CVPR 2024 challenge --- ...et3d_8xb1_embodiedscan-3d-284class-9dof.py | 4 +- ...et3d_8xb4_embodiedscan-3d-284class-9dof.py | 34 ++++++++--------- ...ounding_8xb12_embodiedscan-vg-9dof-full.py | 38 +++++++++---------- ...mv-grounding_8xb12_embodiedscan-vg-9dof.py | 38 +++++++++---------- ...g_8xb12_embodiedscan-vg-9dof_fcaf-coder.py | 38 +++++++++---------- 5 files changed, 72 insertions(+), 80 deletions(-) diff --git a/configs/detection/cont-det3d_8xb1_embodiedscan-3d-284class-9dof.py b/configs/detection/cont-det3d_8xb1_embodiedscan-3d-284class-9dof.py index 6d5a603..b782b08 100644 --- a/configs/detection/cont-det3d_8xb1_embodiedscan-3d-284class-9dof.py +++ b/configs/detection/cont-det3d_8xb1_embodiedscan-3d-284class-9dof.py @@ -190,7 +190,7 @@ times=8, dataset=dict(type=dataset_type, data_root=data_root, - ann_file='embodiedscan_infos_train_split.pkl', + ann_file='embodiedscan_infos_train.pkl', pipeline=train_pipeline, test_mode=False, filter_empty_gt=True, @@ -205,7 +205,7 @@ 
sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict(type=dataset_type, data_root=data_root, - ann_file='embodiedscan_infos_val_split.pkl', + ann_file='embodiedscan_infos_val.pkl', pipeline=test_pipeline, test_mode=True, filter_empty_gt=True, diff --git a/configs/detection/mv-det3d_8xb4_embodiedscan-3d-284class-9dof.py b/configs/detection/mv-det3d_8xb4_embodiedscan-3d-284class-9dof.py index 976e54b..458cb8b 100644 --- a/configs/detection/mv-det3d_8xb4_embodiedscan-3d-284class-9dof.py +++ b/configs/detection/mv-det3d_8xb4_embodiedscan-3d-284class-9dof.py @@ -184,30 +184,28 @@ sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict(type='RepeatDataset', times=10, - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_train_split_filtered.pkl', - pipeline=train_pipeline, - test_mode=False, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - metainfo=metainfo))) + dataset=dict(type=dataset_type, + data_root=data_root, + ann_file='embodiedscan_infos_train.pkl', + pipeline=train_pipeline, + test_mode=False, + filter_empty_gt=True, + box_type_3d='Euler-Depth', + metainfo=metainfo))) val_dataloader = dict(batch_size=1, num_workers=1, persistent_workers=True, drop_last=False, sampler=dict(type='DefaultSampler', shuffle=False), - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_val_split_filtered.pkl', - pipeline=test_pipeline, - test_mode=True, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - metainfo=metainfo)) + dataset=dict(type=dataset_type, + data_root=data_root, + ann_file='embodiedscan_infos_val.pkl', + pipeline=test_pipeline, + test_mode=True, + filter_empty_gt=True, + box_type_3d='Euler-Depth', + metainfo=metainfo)) test_dataloader = val_dataloader val_evaluator = dict(type='IndoorDetMetric') diff --git a/configs/grounding/mv-grounding_8xb12_embodiedscan-vg-9dof-full.py b/configs/grounding/mv-grounding_8xb12_embodiedscan-vg-9dof-full.py index f1568db..d311d3f 
100644 --- a/configs/grounding/mv-grounding_8xb12_embodiedscan-vg-9dof-full.py +++ b/configs/grounding/mv-grounding_8xb12_embodiedscan-vg-9dof-full.py @@ -139,32 +139,30 @@ sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict(type='RepeatDataset', times=1, - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_train_split_filtered.pkl', - vg_file='embodiedscan_train_full_vg.json', - metainfo=metainfo, - pipeline=train_pipeline, - test_mode=False, - filter_empty_gt=True, - box_type_3d='Euler-Depth'))) + dataset=dict(type=dataset_type, + data_root=data_root, + ann_file='embodiedscan_infos_train.pkl', + vg_file='embodiedscan_train_full_vg.json', + metainfo=metainfo, + pipeline=train_pipeline, + test_mode=False, + filter_empty_gt=True, + box_type_3d='Euler-Depth'))) val_dataloader = dict(batch_size=12, num_workers=12, persistent_workers=True, drop_last=False, sampler=dict(type='DefaultSampler', shuffle=False), - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_val_split_filtered.pkl', - vg_file='embodiedscan_val_full_vg.json', - metainfo=metainfo, - pipeline=test_pipeline, - test_mode=True, - filter_empty_gt=True, - box_type_3d='Euler-Depth')) + dataset=dict(type=dataset_type, + data_root=data_root, + ann_file='embodiedscan_infos_val.pkl', + vg_file='embodiedscan_val_full_vg.json', + metainfo=metainfo, + pipeline=test_pipeline, + test_mode=True, + filter_empty_gt=True, + box_type_3d='Euler-Depth')) test_dataloader = val_dataloader val_evaluator = dict(type='GroundingMetric') diff --git a/configs/grounding/mv-grounding_8xb12_embodiedscan-vg-9dof.py b/configs/grounding/mv-grounding_8xb12_embodiedscan-vg-9dof.py index 38dc457..1362c26 100644 --- a/configs/grounding/mv-grounding_8xb12_embodiedscan-vg-9dof.py +++ b/configs/grounding/mv-grounding_8xb12_embodiedscan-vg-9dof.py @@ -139,32 +139,30 @@ sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict(type='RepeatDataset', 
times=1, - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_train_split_filtered.pkl', - vg_file='embodiedscan_train_vg.json', - metainfo=metainfo, - pipeline=train_pipeline, - test_mode=False, - filter_empty_gt=True, - box_type_3d='Euler-Depth'))) + dataset=dict(type=dataset_type, + data_root=data_root, + ann_file='embodiedscan_infos_train.pkl', + vg_file='embodiedscan_train_mini_vg.json', + metainfo=metainfo, + pipeline=train_pipeline, + test_mode=False, + filter_empty_gt=True, + box_type_3d='Euler-Depth'))) val_dataloader = dict(batch_size=12, num_workers=12, persistent_workers=True, drop_last=False, sampler=dict(type='DefaultSampler', shuffle=False), - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_val_split_filtered.pkl', - vg_file='embodiedscan_val_vg.json', - metainfo=metainfo, - pipeline=test_pipeline, - test_mode=True, - filter_empty_gt=True, - box_type_3d='Euler-Depth')) + dataset=dict(type=dataset_type, + data_root=data_root, + ann_file='embodiedscan_infos_val.pkl', + vg_file='embodiedscan_val_mini_vg.json', + metainfo=metainfo, + pipeline=test_pipeline, + test_mode=True, + filter_empty_gt=True, + box_type_3d='Euler-Depth')) test_dataloader = val_dataloader val_evaluator = dict(type='GroundingMetric') diff --git a/configs/grounding/mv-grounding_8xb12_embodiedscan-vg-9dof_fcaf-coder.py b/configs/grounding/mv-grounding_8xb12_embodiedscan-vg-9dof_fcaf-coder.py index 1342198..1896402 100644 --- a/configs/grounding/mv-grounding_8xb12_embodiedscan-vg-9dof_fcaf-coder.py +++ b/configs/grounding/mv-grounding_8xb12_embodiedscan-vg-9dof_fcaf-coder.py @@ -140,32 +140,30 @@ sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict(type='RepeatDataset', times=1, - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_train_split_filtered.pkl', - vg_file='embodiedscan_train_vg.json', - metainfo=metainfo, - pipeline=train_pipeline, - 
test_mode=False, - filter_empty_gt=True, - box_type_3d='Euler-Depth'))) + dataset=dict(type=dataset_type, + data_root=data_root, + ann_file='embodiedscan_infos_train.pkl', + vg_file='embodiedscan_train_mini_vg.json', + metainfo=metainfo, + pipeline=train_pipeline, + test_mode=False, + filter_empty_gt=True, + box_type_3d='Euler-Depth'))) val_dataloader = dict(batch_size=12, num_workers=12, persistent_workers=True, drop_last=False, sampler=dict(type='DefaultSampler', shuffle=False), - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_val_split_filtered.pkl', - vg_file='embodiedscan_val_vg.json', - metainfo=metainfo, - pipeline=test_pipeline, - test_mode=True, - filter_empty_gt=True, - box_type_3d='Euler-Depth')) + dataset=dict(type=dataset_type, + data_root=data_root, + ann_file='embodiedscan_infos_val.pkl', + vg_file='embodiedscan_val_mini_vg.json', + metainfo=metainfo, + pipeline=test_pipeline, + test_mode=True, + filter_empty_gt=True, + box_type_3d='Euler-Depth')) test_dataloader = val_dataloader val_evaluator = dict(type='GroundingMetric')