#include "megbrain/opr/standalone/nms_opr.h"
#if MGB_CUDA
#include "./nms_kern.cuh"
#endif
#include "./nms_cpu.h"
#include "megbrain/comp_node_env.h"
#include "megbrain/serialization/sereg.h"
#include "megbrain/utils/arith_helper.h"
#if MGB_ENABLE_FBS_SERIALIZATION
#include "megbrain/serialization/internal/mgb_cpp_opr_generated.h"
#include "megbrain/serialization/internal/schema_generated.h"
#endif
// Bring the standalone operator namespace into scope so that NMSKeep can be
// named without qualification in the rest of this file.
using namespace mgb::opr::standalone;
// Dynamic-type boilerplate for NMSKeep.
// NOTE(review): the exact expansion lives in the macro's header; confirm there.
MGB_DYN_TYPE_OBJ_FINAL_IMPL(NMSKeep);
/*!
 * \brief Device-specific backend interface of NMSKeep.
 *
 * The NMSKeep constructor instantiates one concrete subclass according to the
 * device type of the input var and stores it in m_kern; shape inference and
 * execution are then forwarded to it.
 */
class NMSKeep::Kern {
public:
    virtual ~Kern() = default;

    /*!
     * \brief size of the scratch buffer needed for the given input shape
     * \param opr the operator on whose behalf the query is made
     * \param boxes shape of the boxes input
     * \return the value that NMSKeep::get_output_var_shape() uses as the
     *      single dimension of the workspace output
     */
    virtual size_t get_workspace_size(
            const NMSKeep* opr, const TensorShape& boxes) = 0;

    /*!
     * \brief run the kernel
     * \param opr the operator being executed (source of param and comp node)
     * \param inp device tensor of the boxes input
     * \param out_idx device tensor of the "indices" output
     * \param out_size device tensor of the "sizes" output
     * \param workspace scratch buffer; NMSKeep::scn_do_execute() passes a
     *      default-constructed tensor when the workspace output is not valid
     */
    virtual void exec(
            const NMSKeep* opr, const DeviceTensorND& inp,
            const DeviceTensorND& out_idx, const DeviceTensorND& out_size,
            const DeviceTensorND& workspace) = 0;
};
#if MGB_CUDA
/*!
 * \brief CUDA backend of NMSKeep.
 *
 * The workspace is laid out as two consecutive regions: an overlap mask,
 * padded to the comp node's memory address alignment, followed by a
 * per-box "rm" mask. The region sizes are cached in the members below and
 * recomputed by init() whenever the input shape is known.
 */
class NMSKeep::CUDAKern final : public Kern {
    //! bytes of the overlap mask: nr_boxes rows of DIVUP(nr_boxes, 64)
    //! 64-bit words
    size_t m_workspace_overlap_mask_bytes;
    //! m_workspace_overlap_mask_bytes after get_aligned_power2() with the
    //! comp node's address alignment; also the byte offset of the rm mask
    size_t m_workspace_overlap_mask_bytes_align;
    //! bytes of the rm mask: DIVUP(nr_boxes, 64) 64-bit words
    size_t m_workspace_rm_mask_bytes;

    /*!
     * \brief recompute the cached workspace region sizes
     * \param opr operator whose comp node provides the address alignment
     * \param boxes input shape; boxes[1] is taken as the number of boxes
     */
    void init(const NMSKeep* opr, const TensorShape& boxes) {
        const auto addr_align = opr->comp_node().get_mem_addr_alignment();
        const size_t nr_boxes = boxes[1];

        if (nr_boxes == 0) {
            // empty input: no workspace is requested at all
            m_workspace_overlap_mask_bytes = 0;
            m_workspace_overlap_mask_bytes_align = 0;
            m_workspace_rm_mask_bytes = 0;
            return;
        }

        // one bit per box, packed into 64-bit words
        const size_t words_per_row = DIVUP(nr_boxes, 64);
        m_workspace_rm_mask_bytes = words_per_row * sizeof(uint64_t);
        m_workspace_overlap_mask_bytes = nr_boxes * m_workspace_rm_mask_bytes;
        m_workspace_overlap_mask_bytes_align =
                get_aligned_power2(m_workspace_overlap_mask_bytes, addr_align);
    }

public:
    //! total workspace bytes: aligned overlap mask followed by the rm mask
    size_t get_workspace_size(
            const NMSKeep* opr, const TensorShape& boxes) override {
        init(opr, boxes);
        const size_t total_bytes =
                m_workspace_overlap_mask_bytes_align + m_workspace_rm_mask_bytes;
        return total_bytes;
    }

    void exec(
            const NMSKeep* opr, const DeviceTensorND& inp,
            const DeviceTensorND& out_idx, const DeviceTensorND& out_size,
            const DeviceTensorND& workspace) override;
};
void NMSKeep::CUDAKern::exec(
const NMSKeep* opr, const DeviceTensorND& inp, const DeviceTensorND& out_idx,
const DeviceTensorND& out_size, const DeviceTensorND& workspace) {
CompNode comp_node = out_idx.comp_node();
mgb_assert(comp_node == opr->comp_node());
auto&& cuda_env = CompNodeEnv::from_comp_node(comp_node).cuda_env();
mgb_assert(
cuda_env.device_prop.warpSize == 32, "invalid warp size: %d",
cuda_env.device_prop.warpSize);
auto stream = cuda_env.stream;
init(opr, inp.shape());
auto inp_ptr = inp.ptr<float>();
void* workspace_ptr = workspace.raw_ptr();
auto dev_overlap_mask = reinterpret_cast<uint64_t*>(workspace_ptr),
dev_rm_mask =
(uint64_t*)(workspace.raw_ptr() + m_workspace_overlap_mask_bytes_align);
auto out_idx_ptr = reinterpret_cast<uint32_t*>(out_idx.ptr<int32_t>()),
out_size_ptr = reinterpret_cast<uint32_t*>(out_size.ptr<int32_t>());
size_t batch = inp.shape(0), nr_boxes = inp.shape(1);
if (nr_boxes == 0) {
MGB_CUDA_CHECK(
cudaMemsetAsync(out_size_ptr, 0, batch * sizeof(uint32_t), stream));
return;
}
MGB_CUDA_CHECK(cudaMemsetAsync(
dev_overlap_mask, 0, m_workspace_overlap_mask_bytes, stream));
auto max_output = opr->param().max_output;
for (size_t i = 0; i < batch; ++i) {
nms::launch_gen_mask(
nr_boxes, opr->param().iou_thresh, inp_ptr + i * nr_boxes * 4,
DIVUP(nr_boxes, 64), dev_overlap_mask, stream);
MGB_CUDA_CHECK(
cudaMemsetAsync(dev_rm_mask, 0, m_workspace_rm_mask_bytes, stream));
nms::launch_gen_indices(
nr_boxes, max_output, DIVUP(nr_boxes, 64), dev_overlap_mask,
dev_rm_mask, out_idx_ptr + i * max_output, out_size_ptr + i, stream);
}
}
#endif
/*!
 * \brief CPU backend of NMSKeep; the actual work is done by nms::cpu_kern().
 */
class NMSKeep::CPUKern final : public Kern {
public:
    ~CPUKern() = default;

    //! workspace size as reported by nms::cpu_kern_workspace() for the
    //! number of boxes (second dimension of the input shape)
    size_t get_workspace_size(const NMSKeep*, const TensorShape& boxes) override {
        const size_t nr_boxes = boxes.shape[1];
        return nms::cpu_kern_workspace(nr_boxes);
    }

    void exec(
            const NMSKeep* opr, const DeviceTensorND& inp,
            const DeviceTensorND& out_idx, const DeviceTensorND& out_size,
            const DeviceTensorND& workspace) override;
};
/*!
 * \brief Run NMS on the CPU comp node of the output.
 *
 * With an empty input the "sizes" output is zero-filled directly and the
 * function returns. Otherwise a closure that calls nms::cpu_kern() once per
 * batch item is handed to the dispatcher of the comp node's CPU environment.
 *
 * \param opr operator being executed; provides param()
 * \param inp boxes input; shape(0) is the batch size, shape(1) the number of
 *      boxes per batch item, 4 floats per box
 * \param out_idx "indices" output, param.max_output entries per batch item
 * \param out_size "sizes" output, one entry per batch item
 * \param workspace scratch buffer passed through to nms::cpu_kern()
 */
void NMSKeep::CPUKern::exec(
        const NMSKeep* opr, const DeviceTensorND& inp, const DeviceTensorND& out_idx,
        const DeviceTensorND& out_size, const DeviceTensorND& workspace) {
    CompNode comp_node = out_idx.comp_node();
    const size_t batch = inp.shape(0);
    const size_t nr_boxes = inp.shape(1);

    if (nr_boxes == 0) {
        // no boxes at all: every batch item keeps zero boxes
        auto sizes = reinterpret_cast<uint32_t*>(out_size.ptr<int32_t>());
        for (size_t b = 0; b < batch; ++b) {
            sizes[b] = 0;
        }
        return;
    }

    auto param = opr->param();
    auto workspace_ptr = workspace.raw_ptr();

    // Everything is captured by value: the closure is handed to dispatch()
    // and must not refer to locals of this function.
    auto kern = [=]() {
        for (size_t b = 0; b < batch; ++b) {
            auto boxes_b = inp.as_megdnn().ptr<float>() + b * nr_boxes * 4;
            auto indices_b =
                    reinterpret_cast<uint32_t*>(out_idx.as_megdnn().ptr<int32_t>()) +
                    b * param.max_output;
            auto size_b =
                    reinterpret_cast<uint32_t*>(out_size.as_megdnn().ptr<int32_t>()) +
                    b;
            nms::cpu_kern(
                    nr_boxes, param.max_output, param.iou_thresh, boxes_b, indices_b,
                    size_b, workspace_ptr);
        }
    };

    CompNodeEnv::from_comp_node(comp_node).cpu_env().dispatch(kern);
}
/*!
 * \brief Construct the operator on the owner graph of \p boxes.
 *
 * Selects the backend kernel from the device type of the input's comp node,
 * then declares one input and three outputs: "indices", "sizes" and a
 * workspace output.
 *
 * \param boxes input var; must have dtype Float32
 * \param param operator parameters, copied into m_param
 * \param config operator node config forwarded to the base class
 * \throw MegBrainError if the device type is neither CPU nor (when built with
 *      MGB_CUDA) CUDA
 */
NMSKeep::NMSKeep(
        VarNode* boxes, const Param& param, const OperatorNodeConfig& config)
        : Super(boxes->owner_graph(), config, "nms_keep", {boxes}),
          m_param{param} {
    mgb_assert(
            boxes->dtype() == dtype::Float32(), "input should be float32; got %s",
            boxes->dtype().name());

    // pick the backend that matches the device the input lives on
    const auto device_type = boxes->comp_node().device_type();
    switch (device_type) {
#if MGB_CUDA
        case CompNode::DeviceType::CUDA:
            m_kern = std::make_unique<CUDAKern>();
            break;
#endif
        case CompNode::DeviceType::CPU:
            m_kern = std::make_unique<CPUKern>();
            break;
        default:
            mgb_throw(
                    MegBrainError, "NMSKeep: unsupported device type: %s",
                    boxes->comp_node().to_string().c_str());
    }

    add_input({boxes});

    // output 0: kept indices; explicitly allowed to have an empty shape
    VarNode* indices_var = add_output("indices");
    indices_var->dtype(dtype::Int32()).add_flag(VarNode::Flag::ALLOW_EMPTY_SHAPE);

    // output 1: per-batch sizes
    VarNode* sizes_var = add_output("sizes");
    sizes_var->dtype(dtype::Int32());

    // output 2: workspace, whose shape is set in get_output_var_shape()
    cg::add_workspace_output(this);

    // m_param takes part in operator equivalence comparison
    add_equivalence_component<PODHash<Param>>(&m_param);
}
// Defaulted here, in the translation unit where Kern (the type held by
// m_kern) is fully defined.
// NOTE(review): assumes m_kern is a smart pointer declared in nms_opr.h; confirm.
NMSKeep::~NMSKeep() noexcept = default;
/*!
 * \brief Insert an NMSKeep operator for \p boxes and return its output var.
 * \param boxes symbolic var of the boxes input
 * \param param operator parameters
 * \param config operator node config
 * \return the value produced by SymbolVar::insert_single_output_opr()
 */
mgb::SymbolVar NMSKeep::make(
        SymbolVar boxes, const Param& param, const OperatorNodeConfig& config) {
    // the constructor takes the underlying VarNode rather than the wrapper
    return boxes.insert_single_output_opr<NMSKeep>(boxes.node(), param, config);
}
void NMSKeep::get_output_var_shape(
const TensorShapeArray& inp_shape, TensorShapeArray& out_shape) const {
auto boxes = inp_shape.at(0);
mgb_assert(
boxes.ndim == 3 && boxes.shape[2] == 4, "invalid box shape: %s",
boxes.to_string().c_str());
mgb_assert(out_shape.size() == 3);
auto batch = boxes[0];
out_shape[0] = {batch, m_param.max_output}; out_shape[1] = {batch}; out_shape[2] = {m_kern->get_workspace_size(this, boxes)}; }
//! Require a contiguous layout for the boxes input; both kernels address it
//! through a plain pointer with computed offsets.
void NMSKeep::add_input_layout_constraint() {
    auto* boxes_var = input(0);
    boxes_var->add_layout_constraint_contiguous();
}
void NMSKeep::scn_do_execute() {
DeviceTensorND empty_workspace;
m_kern->exec(
this, input(0)->dev_tensor(), output(0)->dev_tensor(),
output(1)->dev_tensor(),
output(2)->dev_tensor_valid() ? output(2)->dev_tensor() : empty_workspace);
}
/*!
 * \brief Build the node property, starting from the base class result.
 *
 * The dependency type of the boxes input is set to VALUE_ALLOW_EMPTY.
 *
 * \return the property object produced by Super::do_make_node_prop()
 */
NMSKeep::NodeProp* NMSKeep::do_make_node_prop() const {
    auto* prop = Super::do_make_node_prop();
    prop->add_dep_type_existing_var(input(0), NodeProp::DepType::VALUE_ALLOW_EMPTY);
    return prop;
}
#if MGB_ENABLE_FBS_SERIALIZATION
namespace mgb {
namespace serialization {
namespace fbs {

/*!
 * \brief Conversion between NMSKeep::Param and its FlatBuffers
 *      representation param::NMSKeep.
 */
template <>
struct ParamConverter<opr::standalone::NMSKeep::Param> {
    using FlatBufferType = param::NMSKeep;

    //! build a Param from the iou_thresh and max_output fields of \p fb
    static opr::standalone::NMSKeep::Param to_param(const FlatBufferType* fb) {
        const auto iou_thresh = fb->iou_thresh();
        const auto max_output = fb->max_output();
        return {iou_thresh, max_output};
    }

    //! write the iou_thresh and max_output fields of \p p through \p builder
    static flatbuffers::Offset<FlatBufferType> to_flatbuffer(
            flatbuffers::FlatBufferBuilder& builder,
            const opr::standalone::NMSKeep::Param& p) {
        auto offset = param::CreateNMSKeep(builder, p.iou_thresh, p.max_output);
        return offset;
    }
};

}  // namespace fbs
}  // namespace serialization
}  // namespace mgb
#endif
namespace mgb {
// Intentionally empty function.
// NOTE(review): judging by the name, it exists so that other code can
// reference a symbol from this object file and keep the linker from dropping
// it (and the operator registration below). Confirm against its callers.
void _hack_pull_in_nms_opr_object() {}
}
// Alias under which the operator is handed to the registration macro below.
using NMSKeepMGB = NMSKeep;
// Register the operator with the serialization system.
// NOTE(review): the second argument (1) is presumably the number of inputs;
// confirm against the definition of MGB_SEREG_OPR in sereg.h.
MGB_SEREG_OPR(NMSKeepMGB, 1);