#pragma once
#include <cuda_runtime.h>
#include <stddef.h>
#include <stdint.h>
/*!
* \brief integer ceiling division: number of size-`chunk` groups needed to
* cover `total` items
*
* Intended for total >= 1 and chunk >= 1. For total == 0 the result is not 0
* in general (e.g. DIVUP(0, 4) evaluates to 1 with int operands). Both
* arguments are expanded exactly once.
*/
#define DIVUP(total, chunk) (1 + ((total)-1) / (chunk))
namespace mgb {
namespace opr {
namespace standalone {
namespace nms {
// Host-side entry points for non-maximum suppression (NMS). Only the
// declarations live in this header; the definitions are not visible here.
/*!
* \brief launch the kernel to generate nms mask
* \param nr_boxes number of input boxes (the `n` used in the layouts below)
* \param nms_overlap_thresh overlapping threshold for IoU
* \param[in] dev_boxes boxes in [n, 4] layout,
* each row containing (x0, y0, x1, y1)
* \param dev_mask_width width in number of uint64_t elements of the dev_mask
* matrix; must be at least ceil(n / 64)
* \param[out] dev_mask [n, dev_mask_width] dev_mask[i] is a
* bitmask of length n indicating whether i overlaps with each box. Only
* the upper triangle (row < col) is filled.
* \param stream CUDA stream argument; presumably the stream on which the
* kernel is enqueued -- TODO confirm against the definition
*/
void launch_gen_mask(
const int nr_boxes, const float nms_overlap_thresh, const float* dev_boxes,
const int dev_mask_width, uint64_t* dev_mask, cudaStream_t stream);
/*!
* \brief launch the kernel to generate indices of kept boxes
* \param nr_boxes presumably the number of input boxes, as passed to
* launch_gen_mask -- TODO confirm against the definition
* \param max_output max number of entries to be written to out_idx
* \param overlap_mask_width presumably the row width of overlap_mask in
* uint64_t elements (the dev_mask_width given to launch_gen_mask) -- TODO
* confirm against the definition
* \param overlap_mask the mask generated by launch_gen_mask
* \param[in,out] rm_mask mask of removed boxes; must be initialized as 0
* \param[out] out_idx indices of boxes to be kept
* \param[out] out_size number of items written to out_idx; the remaining items
* would be filled with the last valid item
* \param stream CUDA stream argument; presumably the stream on which the
* kernel is enqueued -- TODO confirm against the definition
*/
void launch_gen_indices(
int nr_boxes, int max_output, int overlap_mask_width,
const uint64_t* overlap_mask, uint64_t* rm_mask, uint32_t* out_idx,
uint32_t* out_size, cudaStream_t stream);
} // namespace nms
} // namespace standalone
} // namespace opr
} // namespace mgb
// vim: ft=cuda syntax=cuda.doxygen