// ------------------------------------------------------------------
// Copyright (c) Microsoft
// Licensed under The MIT License
// Modified from MATLAB Faster R-CNN (https://github.com/shaoqingren/faster_rcnn)
// ------------------------------------------------------------------

#include "gpu_nms.hpp"
#include <vector>
#include <iostream>
#include <cstring>  // memset, used in the host-side suppression pass

#define CUDA_CHECK(condition) \
  /* Code block avoids redefinition of cudaError_t error */ \
  do { \
    cudaError_t error = condition; \
    if (error != cudaSuccess) { \
      std::cout << cudaGetErrorString(error) << std::endl; \
    } \
  } while (0)

// DIVUP(m, n): integer division of m by n, rounded up.
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
// 64 threads per block: one bit per box in an unsigned long long mask word.
int const threadsPerBlock = sizeof(unsigned long long) * 8;

// Intersection-over-union of two boxes given by corner coordinates
// (x1, y1, x2, y2); the +1 terms follow the inclusive pixel-coordinate convention.
__device__ inline float devIoU(float const * const a, float const * const b) {
  float left = max(a[0], b[0]), right = min(a[2], b[2]);
  float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
  float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
  float interS = width * height;
  float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
  float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
  return interS / (Sa + Sb - interS);
}

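// nms_kernel: boxes are processed in 64-box tiles. Block (col_start, row_start)
// compares every box of row tile row_start against every box of column tile
// col_start. For row box i it writes one 64-bit word to
// dev_mask[i * col_blocks + col_start], where bit j is set when box
// (col_start * 64 + j) overlaps box i by more than nms_overlap_thresh.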
__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
                           const float *dev_boxes, unsigned long long *dev_mask) {
  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  // if (row_start > col_start) return;

  const int row_size =
        min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
  const int col_size =
        min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);

  // Stage this column tile's boxes (5 floats each) in shared memory.
  __shared__ float block_boxes[threadsPerBlock * 5];
  if (threadIdx.x < col_size) {
    block_boxes[threadIdx.x * 5 + 0] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
    block_boxes[threadIdx.x * 5 + 1] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
    block_boxes[threadIdx.x * 5 + 2] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
    block_boxes[threadIdx.x * 5 + 3] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
    block_boxes[threadIdx.x * 5 + 4] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
  }
  __syncthreads();

  if (threadIdx.x < row_size) {
    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
    const float *cur_box = dev_boxes + cur_box_idx * 5;
    int i = 0;
    unsigned long long t = 0;
    int start = 0;
    if (row_start == col_start) {
      // On the diagonal tile, only compare against boxes that come later.
      start = threadIdx.x + 1;
    }
    for (i = start; i < col_size; i++) {
      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
        t |= 1ULL << i;
      }
    }
    const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
    dev_mask[cur_box_idx * col_blocks + col_start] = t;
  }
}

void _set_device(int device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == device_id) {
    return;
  }
  // The call to cudaSetDevice must come before any calls to Get, which
  // may perform initialization using the GPU.
  CUDA_CHECK(cudaSetDevice(device_id));
}

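// _nms: copies the boxes to the selected GPU, launches nms_kernel to build
// the pairwise-overlap bitmask, copies the mask back, and runs a greedy
// suppression pass on the host. Because the greedy pass keeps earlier boxes
// over later ones, callers are expected to pass boxes sorted by descending
// score (the sorting is done by the caller, not here).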
void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
          int boxes_dim, float nms_overlap_thresh, int device_id) {
  _set_device(device_id);

  float* boxes_dev = NULL;
  unsigned long long* mask_dev = NULL;

  const int col_blocks = DIVUP(boxes_num, threadsPerBlock);

  CUDA_CHECK(cudaMalloc(&boxes_dev,
                        boxes_num * boxes_dim * sizeof(float)));
  CUDA_CHECK(cudaMemcpy(boxes_dev,
                        boxes_host,
                        boxes_num * boxes_dim * sizeof(float),
                        cudaMemcpyHostToDevice));

  CUDA_CHECK(cudaMalloc(&mask_dev,
                        boxes_num * col_blocks * sizeof(unsigned long long)));

  // One 64-thread block per (row tile, column tile) pair of boxes.
  dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
              DIVUP(boxes_num, threadsPerBlock));
  dim3 threads(threadsPerBlock);
  nms_kernel<<<blocks, threads>>>(boxes_num,
                                  nms_overlap_thresh,
                                  boxes_dev,
                                  mask_dev);

  std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
  CUDA_CHECK(cudaMemcpy(&mask_host[0],
                        mask_dev,
                        sizeof(unsigned long long) * boxes_num * col_blocks,
                        cudaMemcpyDeviceToHost));

  std::vector<unsigned long long> remv(col_blocks);
  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);

  // Greedy pass in box order: keep box i unless a previously kept box has
  // already marked it as suppressed in remv, then OR its overlap mask into
  // remv so the boxes it suppresses are skipped later.
  int num_to_keep = 0;
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / threadsPerBlock;
    int inblock = i % threadsPerBlock;

    if (!(remv[nblock] & (1ULL << inblock))) {
      keep_out[num_to_keep++] = i;
      unsigned long long *p = &mask_host[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
    }
  }
  *num_out = num_to_keep;

  CUDA_CHECK(cudaFree(boxes_dev));
  CUDA_CHECK(cudaFree(mask_dev));
}
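
// Example (hypothetical caller, not part of this file): `dets` is a host
// array of boxes_num x 5 floats (x1, y1, x2, y2, score), pre-sorted by
// descending score:
//
//   std::vector<int> keep(boxes_num);
//   int num_kept = 0;
//   _nms(keep.data(), &num_kept, dets, boxes_num, 5, 0.7f, 0);
//   // keep[0 .. num_kept) now holds the indices of the retained boxes.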