#include "megbrain_build_config.h"
#if MGB_CUDA
#include <mutex>
#include <vector>
#include "denseflownvidia.h"
#include "megbrain/common.h"
/*!
 * \brief Record the expected input geometry and the NVOF performance preset.
 *
 * The body only stores configuration; it makes no CUDA driver call. The
 * engine itself is created later by init_nvof_engine().
 *
 * \param device_id GPU ordinal, stored in m_device_id and later passed to
 *      set_device()
 * \param shape input shape; shape[0] is the batch size, shape[1] the temporal
 *      size, shape[2] the height and shape[3] the width. At least four
 *      entries are required.
 * \param preset performance preset selector: 0 -> NV_OF_PERF_LEVEL_SLOW,
 *      1 -> NV_OF_PERF_LEVEL_MEDIUM, 2 -> NV_OF_PERF_LEVEL_FAST
 * \param use_cuda_stream whether dedicated input/output streams are created
 *      when the engine is initialized
 * \param debug enables the extra log lines guarded by debug_flag
 * \throw MegBrainError if \p shape has fewer than four entries or \p preset
 *      is not one of the values listed above
 */
NVFlowExtractor::NVFlowExtractor(
        int device_id, std::vector<size_t>& shape, uint32_t preset,
        bool use_cuda_stream, bool debug) {
    // shape[0..3] are read below; reject shorter shapes instead of reading
    // past the end of the vector.
    if (shape.size() < 4) {
        mgb_throw(
                MegBrainError,
                "NVOF: invalid input shape rank! err type: NV_OF_ERR_INVALID_PARAM");
    }

    batch_size = shape[0];
    m_width = shape[3];
    m_height = shape[2];
    debug_flag = debug;
    m_temporal_size = shape[1];
    m_use_cuda_stream = use_cuda_stream;

    // Output flow grid: one cell per m_out_grid_size pixels, rounded up.
    out_width = (m_width + m_out_grid_size - 1) / m_out_grid_size;
    out_height = (m_height + m_out_grid_size - 1) / m_out_grid_size;
    m_width_in_blocks = (m_width + m_blockSizeX - 1) / m_blockSizeX;
    m_height_in_blocks = (m_height + m_blockSizeY - 1) / m_blockSizeY;
    // Two values per output cell (presumably the x/y flow components).
    out_size = out_width * out_height * 2;
    m_device_id = device_id;

    // The table never changes, so build it once instead of on every call.
    static const std::unordered_map<uint32_t, NV_OF_PERF_LEVEL> preset_map = {
            {0, NV_OF_PERF_LEVEL_SLOW},
            {1, NV_OF_PERF_LEVEL_MEDIUM},
            {2, NV_OF_PERF_LEVEL_FAST}};
    _preset = preset;
    const auto search = preset_map.find(_preset);
    if (search == preset_map.end()) {
        mgb_throw(
                MegBrainError,
                "NVOF: invalid preset level! err type: NV_OF_ERR_INVALID_PARAM");
    }
    perf_preset = search->second;
}
/*!
 * \brief Create the NvOFCuda object and allocate its input/output buffers.
 *
 * \param height frame height forwarded to NvOFCuda::Create
 * \param width frame width forwarded to NvOFCuda::Create
 */
void NVFlowExtractor::create_nvof_instances(int height, int width) {
    nv_optical_flow = NvOFCuda::Create(
            cu_context, width, height, buffer_format, input_buffer_type,
            output_buffer_type, NV_OF_MODE_OPTICALFLOW, perf_preset, input_stream,
            output_stream);
    nv_optical_flow->Init(m_out_grid_size);

    // Per batch item: buffer_pool_size input buffers and one fewer output
    // buffer.
    const auto input_buffer_count = buffer_pool_size * batch_size;
    const auto output_buffer_count = (buffer_pool_size - 1) * batch_size;

    input_buffers = nv_optical_flow->CreateBuffers(
            NV_OF_BUFFER_USAGE_INPUT, input_buffer_count);
    output_buffers = nv_optical_flow->CreateBuffers(
            NV_OF_BUFFER_USAGE_OUTPUT, output_buffer_count);
}
void NVFlowExtractor::init_nvof_engine() {
std::lock_guard<std::mutex> lock(m_lock);
if (init_flag == false) {
set_device(m_device_id);
if (cuCtxCreate(&cu_context, 0, cu_device)) {
mgb_log_warn("nvof: create ctx failed, fallback to get current ctx");
CUDA_DRVAPI_CALL(cuCtxGetCurrent(&cu_context));
}
if (m_use_cuda_stream) {
CUDA_DRVAPI_CALL(cuStreamCreate(&input_stream, CU_STREAM_DEFAULT));
CUDA_DRVAPI_CALL(cuStreamCreate(&output_stream, CU_STREAM_DEFAULT));
}
create_nvof_instances(m_height, m_width);
init_flag = true;
}
}
/*!
 * \brief Release the CUDA streams owned by this extractor.
 *
 * The streams are created lazily by init_nvof_engine(), so they may never
 * have been created when the destructor runs. Null handles are skipped, and a
 * failing cuStreamDestroy is logged rather than silently ignored. Nothing is
 * thrown from here.
 */
NVFlowExtractor::~NVFlowExtractor() {
    if (debug_flag) {
        mgb_log_debug("%s: %d start", __FUNCTION__, __LINE__);
    }
    if (m_use_cuda_stream) {
        // Destroy one stream if it exists and reset the handle afterwards.
        // NOTE(review): relies on the handles being null until
        // init_nvof_engine() creates them -- confirm the in-class
        // initializers in the header.
        auto destroy_stream = [](CUstream& stream, const char* name) {
            if (stream == nullptr) {
                return;
            }
            const auto ret = cuStreamDestroy(stream);
            if (ret != CUDA_SUCCESS) {
                mgb_log_warn(
                        "nvof: destroy %s stream failed, err code: %d", name,
                        static_cast<int>(ret));
            }
            stream = nullptr;
        };
        destroy_stream(output_stream, "output");
        destroy_stream(input_stream, "input");
    }
    if (debug_flag) {
        mgb_log_debug("%s: %d end", __FUNCTION__, __LINE__);
    }
}
void NVFlowExtractor::set_device(int dev_id) {
int nGpu = 0;
if (debug_flag) {
mgb_log_warn("config nvof gpu device id: %d", dev_id);
}
CUDA_DRVAPI_CALL(cuInit(0));
CUDA_DRVAPI_CALL(cuDeviceGetCount(&nGpu));
if (dev_id < 0 || dev_id >= nGpu) {
mgb_log_warn("GPU ordinal out of range. Should be with in [0, %d]", nGpu - 1);
mgb_throw(
MegBrainError, "NVOF: GPU Setting Error! err type: NV_OF_ERR_GENERIC");
}
CUDA_DRVAPI_CALL(cuDeviceGet(&cu_device, dev_id));
}
/*!
 * \brief Query the CUDA memory type of the pointer \p p.
 *
 * If cuPointerGetAttribute fails, a warning is logged and the pointer is
 * treated as host memory. If it succeeds, only the host and device memory
 * types are accepted; any other type trips the assertion.
 *
 * \param p pointer value to classify
 * \return CU_MEMORYTYPE_HOST or CU_MEMORYTYPE_DEVICE
 */
CUmemorytype NVFlowExtractor::get_mem_type(CUdeviceptr p) {
    unsigned int mem_type;
    const auto query_status =
            cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, p);

    if (query_status != CUDA_SUCCESS) {
        // The query failed: fall back to treating the pointer as host memory.
        mgb_log_warn(
                "nvof call cuPointerGetAttribute err!!, may init nvof opr on "
                "cpu comp_node, force set mem type to CU_MEMORYTYPE_HOST");
        return CU_MEMORYTYPE_HOST;
    }

    mgb_assert(
            CU_MEMORYTYPE_DEVICE == mem_type || CU_MEMORYTYPE_HOST == mem_type,
            "only imp CU_MEMORYTYPE_HOST or CU_MEMORYTYPE_DEVICE mem type");
    return static_cast<CUmemorytype_enum>(mem_type);
}
void NVFlowExtractor::extract_flow(
unsigned char* frames, std::vector<size_t>& shape, int16_t* result_out_ptr) {
auto batch_size = shape[0];
auto temporal_size = shape[1];
auto height = shape[2];
auto width = shape[3];
auto channel = shape[4];
auto temporal_len = height * width * channel;
auto batch_len = temporal_size * height * width * channel;
init_nvof_engine();
auto src_mem_type = get_mem_type(reinterpret_cast<CUdeviceptr>(frames));
auto out_mem_type = get_mem_type(reinterpret_cast<CUdeviceptr>(result_out_ptr));
if ((height != m_height || width != m_width) ||
(m_temporal_size != temporal_size)) {
mgb_log_warn("We do not support dynamic shape at mgb side");
mgb_throw(MegBrainError, "NVOF: Nvof err shap!!!! err type: NV_OF_ERR_GENERIC");
}
for (size_t batch_idx = 0; batch_idx < batch_size; batch_idx++) {
auto input_buffer_batch_offsect = buffer_pool_size * batch_idx;
auto output_buffer_batch_offsect = (buffer_pool_size - 1) * batch_idx;
input_buffers[input_buffer_batch_offsect]->UploadData(
(unsigned char*)(frames + batch_idx * batch_len), src_mem_type);
for (size_t temporal_idx = 1; temporal_idx < temporal_size; temporal_idx++) {
input_buffers[input_buffer_batch_offsect + temporal_idx % buffer_pool_size]->UploadData(
(unsigned char*)(frames + batch_idx * batch_len + temporal_idx * temporal_len),
src_mem_type);
nv_optical_flow->Execute(
input_buffers
[input_buffer_batch_offsect +
(temporal_idx - 1) % buffer_pool_size]
.get(),
input_buffers
[input_buffer_batch_offsect +
temporal_idx % buffer_pool_size]
.get(),
output_buffers
[output_buffer_batch_offsect +
(temporal_idx - 1) % (buffer_pool_size - 1)]
.get(),
nullptr, nullptr);
output_buffers
[output_buffer_batch_offsect +
(temporal_idx - 1) % (buffer_pool_size - 1)]
->DownloadData(
result_out_ptr +
batch_idx * (temporal_size - 1) * out_size +
(temporal_idx - 1) * out_size,
out_mem_type);
}
}
CUDA_DRVAPI_CALL(cuCtxSynchronize());
}
float NVFlowExtractor::get_precision() {
return m_precision;
}
#endif