
Commit 91ee5b6

Bin Xiao committed
add nms_kernel.cu
1 parent 59c9a78 commit 91ee5b6

File tree

1 file changed, 143 insertions(+), 0 deletions(-)


lib/nms/nms_kernel.cu

+143
@@ -0,0 +1,143 @@
// ------------------------------------------------------------------
// Copyright (c) Microsoft
// Licensed under The MIT License
// Modified from MATLAB Faster R-CNN (https://github.com/shaoqingren/faster_rcnn)
// ------------------------------------------------------------------

#include "gpu_nms.hpp"
#include <vector>
#include <iostream>
#include <cstring>  // for memset, used below

// Print, but do not abort on, CUDA runtime errors.
#define CUDA_CHECK(condition) \
  /* Code block avoids redefinition of cudaError_t error */ \
  do { \
    cudaError_t error = condition; \
    if (error != cudaSuccess) { \
      std::cout << cudaGetErrorString(error) << std::endl; \
    } \
  } while (0)

// Ceiling division: how many n-sized chunks are needed to cover m items.
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))

// 64 threads per block, one box per thread, so each block's overlap results
// pack into the 64 bits of a single unsigned long long mask word.
int const threadsPerBlock = sizeof(unsigned long long) * 8;

// Intersection-over-union of two boxes stored as [x1, y1, x2, y2]; the
// "+ 1" terms treat coordinates as inclusive pixel indices.
__device__ inline float devIoU(float const * const a, float const * const b) {
  float left = max(a[0], b[0]), right = min(a[2], b[2]);
  float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
  float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
  float interS = width * height;
  float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
  float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
  return interS / (Sa + Sb - interS);
}
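
// Worked check with made-up numbers: for a = {0, 0, 100, 100} and
// b = {5, 5, 105, 105}, the intersection is 96 * 96 = 9216 and each box
// area is 101 * 101 = 10201, so devIoU returns 9216 / (2 * 10201 - 9216),
// roughly 0.82.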

// Each block compares one 64-box "row" tile against one 64-box "column"
// tile: thread x tests box (row_start * 64 + x) against every box of the
// column tile and records overlaps above the threshold as bits of a
// 64-bit mask. Boxes are 5 floats each: x1, y1, x2, y2, score.
__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
                           const float *dev_boxes, unsigned long long *dev_mask) {
  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  // if (row_start > col_start) return;

  const int row_size =
        min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
  const int col_size =
        min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);

  // Stage the column tile's boxes in shared memory.
  __shared__ float block_boxes[threadsPerBlock * 5];
  if (threadIdx.x < col_size) {
    block_boxes[threadIdx.x * 5 + 0] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
    block_boxes[threadIdx.x * 5 + 1] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
    block_boxes[threadIdx.x * 5 + 2] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
    block_boxes[threadIdx.x * 5 + 3] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
    block_boxes[threadIdx.x * 5 + 4] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
  }
  __syncthreads();

  if (threadIdx.x < row_size) {
    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
    const float *cur_box = dev_boxes + cur_box_idx * 5;
    int i = 0;
    unsigned long long t = 0;
    int start = 0;
    // On diagonal tiles, start past the current box so a box is never
    // compared with itself.
    if (row_start == col_start) {
      start = threadIdx.x + 1;
    }
    for (i = start; i < col_size; i++) {
      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
        t |= 1ULL << i;
      }
    }
    const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
    dev_mask[cur_box_idx * col_blocks + col_start] = t;
  }
}

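// Mask layout example with invented sizes: with n_boxes = 100 and
// threadsPerBlock = 64, col_blocks = DIVUP(100, 64) = 2, so dev_mask is a
// 100 x 2 array of 64-bit words; word (i, j) records which boxes of column
// tile j overlap box i above the threshold.
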
void _set_device(int device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == device_id) {
    return;
  }
  // The call to cudaSetDevice must come before any calls to Get, which
  // may perform initialization using the GPU.
  CUDA_CHECK(cudaSetDevice(device_id));
}

// Entry point declared in gpu_nms.hpp. Expects boxes_host sorted by
// descending score (the greedy pass below relies on it); note the kernel
// itself assumes boxes_dim == 5. Writes kept indices to keep_out and their
// count to *num_out.
void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
          int boxes_dim, float nms_overlap_thresh, int device_id) {
  _set_device(device_id);

  float* boxes_dev = NULL;
  unsigned long long* mask_dev = NULL;

  const int col_blocks = DIVUP(boxes_num, threadsPerBlock);

  CUDA_CHECK(cudaMalloc(&boxes_dev,
                        boxes_num * boxes_dim * sizeof(float)));
  CUDA_CHECK(cudaMemcpy(boxes_dev,
                        boxes_host,
                        boxes_num * boxes_dim * sizeof(float),
                        cudaMemcpyHostToDevice));

  CUDA_CHECK(cudaMalloc(&mask_dev,
                        boxes_num * col_blocks * sizeof(unsigned long long)));

  dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
              DIVUP(boxes_num, threadsPerBlock));
  dim3 threads(threadsPerBlock);
  nms_kernel<<<blocks, threads>>>(boxes_num,
                                  nms_overlap_thresh,
                                  boxes_dev,
                                  mask_dev);

  std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
  CUDA_CHECK(cudaMemcpy(&mask_host[0],
                        mask_dev,
                        sizeof(unsigned long long) * boxes_num * col_blocks,
                        cudaMemcpyDeviceToHost));

  std::vector<unsigned long long> remv(col_blocks);
  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);

  // Greedy selection in score order: keep box i unless an already-kept box
  // has flagged it, then fold box i's overlap mask into the running
  // suppression set.
  int num_to_keep = 0;
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / threadsPerBlock;
    int inblock = i % threadsPerBlock;

    if (!(remv[nblock] & (1ULL << inblock))) {
      keep_out[num_to_keep++] = i;
      unsigned long long *p = &mask_host[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
    }
  }
  *num_out = num_to_keep;

  CUDA_CHECK(cudaFree(boxes_dev));
  CUDA_CHECK(cudaFree(mask_dev));
}
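
For reference, here is a minimal host-side sketch of how this entry point might be driven. It is illustrative only: the three boxes, the 0.5 threshold, and device 0 are made-up test values, while _nms and its signature come from the file above (5 floats per box, sorted by descending score).

// Hypothetical test driver for _nms; box values and threshold are invented.
#include <vector>
#include <iostream>
#include "gpu_nms.hpp"

int main() {
  // Three boxes, 5 floats each (x1, y1, x2, y2, score),
  // pre-sorted by descending score as the greedy pass requires.
  std::vector<float> boxes = {
      0.f,   0.f,   100.f, 100.f, 0.9f,  // box 0
      5.f,   5.f,   105.f, 105.f, 0.8f,  // box 1: IoU ~0.82 with box 0
      200.f, 200.f, 300.f, 300.f, 0.7f   // box 2: disjoint from both
  };
  const int boxes_num = 3;
  std::vector<int> keep(boxes_num);
  int num_out = 0;

  _nms(keep.data(), &num_out, boxes.data(), boxes_num,
       /*boxes_dim=*/5, /*nms_overlap_thresh=*/0.5f, /*device_id=*/0);

  // Expected: box 1 is suppressed by box 0, so keep = {0, 2}, num_out = 2.
  for (int i = 0; i < num_out; i++) {
    std::cout << "keep " << keep[i] << std::endl;
  }
  return 0;
}

Such a driver would be compiled with nvcc alongside nms_kernel.cu.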
