#include "megbrain/tensorrt/tensorrt_runtime_opr.h"
#include "megbrain/common.h"
#include "megbrain/plugin/profiler.h"
#include "megbrain/serialization/opr_load_dump.h"
#include "megbrain/version_symbol.h"
#include "megdnn/basic_types.h"
#include <cinttypes>
#if MGB_ENABLE_TENSOR_RT
#include <NvInferPlugin.h>
using namespace mgb;
using namespace opr;
using TensorRTManager = intl::TensorRTManager;
namespace {
DType get_dtype_from_trt(nvinfer1::DataType trt_dtype) {
switch (trt_dtype) {
case nvinfer1::DataType::kFLOAT:
return dtype::Float32();
case nvinfer1::DataType::kHALF:
#if !MEGDNN_DISABLE_FLOAT16
return dtype::Float16();
#else
mgb_throw(MegBrainError, "Float16 support is disabled.");
#endif
case nvinfer1::DataType::kINT8:
return dtype::QuantizedS8(1.f);
case nvinfer1::DataType::kINT32:
return dtype::Int32();
default:
mgb_assert("DataType of trt engine is unknown.");
}
return DType();
}
}
MGB_DYN_TYPE_OBJ_FINAL_IMPL(TensorRTRuntimeOpr);
TensorRTRuntimeOpr::TensorRTRuntimeOpr(
std::shared_ptr<nvinfer1::ICudaEngine> engine,
std::shared_ptr<GpuAllocator> gpu_allocator, const VarNodeArray& inputs,
const OperatorNodeConfig& config)
: Super(inputs.at(0)->owner_graph(), config, "tensor_rt", {inputs.at(0)}),
m_gpu_allocator{std::move(gpu_allocator)},
m_engine{std::move(engine)},
m_trt_engine_has_batch{false} {
mgb_assert(
inputs[0]->comp_node().device_type() == CompNode::DeviceType::CUDA,
"TensorRTRuntimeOpr can only be used on cuda comp nodes; got %s",
inputs[0]->comp_node().to_string().c_str());
size_t nr_input = 0;
bool is_input = true;
#if NV_TENSOR_RT_VERSION >= 6001
auto profile_num = m_engine->getNbOptimizationProfiles();
#else
int profile_num = 1;
#endif
auto bindings_per_profile = m_engine->getNbBindings() / profile_num;
for (int i = 0; i < bindings_per_profile; ++i) {
if (m_engine->bindingIsInput(nr_input)) {
mgb_assert(is_input, "mixed input/output bindings");
if (m_engine->getBindingDimensions(nr_input).nbDims != 3)
m_trt_engine_has_batch = true;
++nr_input;
} else {
is_input = false;
}
}
size_t nr_output = bindings_per_profile - nr_input;
mgb_assert(
nr_input == inputs.size(), "inputs size not equal: expect=%zu got=%zu",
nr_input, inputs.size());
for (auto i : inputs) {
add_input({i});
}
if (nr_output == 1) {
add_output(None);
} else {
for (size_t i = 0; i < nr_output; ++i)
add_output(ssprintf("o%zu", i));
}
cg::add_workspace_output(this);
add_equivalence_component<mgb::ScalarHash<void*>>(m_engine.get());
}
void TensorRTRuntimeOpr::get_output_var_shape(
const TensorShapeArray& inp_shape, TensorShapeArray& out_shape) const {
auto batch = inp_shape.at(0)[0];
m_manager.create_trt_context(this->comp_node(), inp_shape, m_engine.get());
auto get_mgb_shape = [&](int binding_idx) -> TensorShape {
auto dims = m_engine->getBindingDimensions(binding_idx);
#if NV_TENSOR_RT_VERSION >= 6001
auto format = m_engine->getBindingFormat(binding_idx);
if (format == nvinfer1::TensorFormat::kCHW4) {
mgb_assert(
dims.nbDims == 3 || dims.nbDims == 4,
"Tensor with NCHW4 format should have dimensions of "
"3/4.(got: %d)",
dims.nbDims);
int chan_pos = 0;
if (dims.nbDims == 4) {
chan_pos = 1;
}
dims.nbDims = dims.nbDims + 1;
dims.d[chan_pos] = (dims.d[chan_pos] + 3) / 4;
dims.d[dims.nbDims - 1] = 4;
}
#endif
auto shape = m_trt_engine_has_batch ? TensorRTOpr::dims2shape(dims)
: TensorRTOpr::dims2shape(dims, batch);
#if NV_TENSOR_RT_VERSION >= 6001
if (static_cast<size_t>(binding_idx) < inp_shape.size()) {
for (int i = 0; i < dims.nbDims; i++) {
if (dims.d[i] == -1) {
shape[i] = inp_shape.at(binding_idx)[i];
}
}
} else {
auto trt_infer_dims = m_manager.get_binding_dimensions(binding_idx);
for (int i = 0; i < dims.nbDims; i++) {
if (dims.d[i] == -1) {
shape[i] = trt_infer_dims.d[i];
}
}
}
#endif
return shape;
};
for (size_t i = 0; i < inp_shape.size(); ++i) {
mgb_assert(batch == inp_shape[i][0], "input batchsize not equal");
TensorShape shp = get_mgb_shape(i);
mgb_assert(
shp.eq_shape(inp_shape[i]), "input shape mismatch: expect=%s got=%s",
shp.to_string().c_str(), inp_shape[i].to_string().c_str());
}
for (size_t i = 0; i < out_shape.size() - 1; ++i) {
out_shape[i] = get_mgb_shape(i + input().size());
}
out_shape.back() = {intl::workspace_size(m_engine.get())};
}
void TensorRTRuntimeOpr::add_input_layout_constraint() {
for (auto i : input()) {
i->add_layout_constraint_contiguous();
}
}
void TensorRTRuntimeOpr::scn_do_execute() {
auto batch = this->input(0)->shape()[0];
if (m_trt_engine_has_batch)
m_manager.exec(
this, m_gpu_allocator ? m_gpu_allocator->comp_node() : CompNode{},
m_engine.get());
else
m_manager.exec(
this, m_gpu_allocator ? m_gpu_allocator->comp_node() : CompNode{},
m_engine.get(), batch);
}
void TensorRTRuntimeOpr::init_output_dtype() {
DType dt_trt, dt_input;
int idx = 0;
for (auto inp : input()) {
dt_trt = get_dtype_from_trt(m_engine->getBindingDataType(idx));
dt_input = inp->dtype();
mgb_assert(
dt_trt.valid() && dt_input.valid() &&
dt_trt.enumv() == dt_input.enumv(),
"Input %d Dtype is not expected in trt engine: expected %s, "
"got %s",
idx, dt_trt.name(), dt_input.name());
idx++;
}
for (size_t i = 0; i < output().size(); ++i) {
dt_trt = get_dtype_from_trt(m_engine->getBindingDataType(idx));
mgb_assert(
dt_trt.valid(),
"output dtype checking failed: invalid dtype returned.");
if (dt_trt.enumv() == DTypeEnum::QuantizedS8) {
mgb_assert(
output(i)->dtype().valid(),
"user should specify scale of output tensor of "
"TensorRTRuntimeOpr.");
}
if (!output(i)->dtype().valid())
output(i)->dtype(dt_trt);
idx++;
}
}
SymbolVarArray TensorRTRuntimeOpr::make(
std::shared_ptr<nvinfer1::ICudaEngine> engine,
std::shared_ptr<GpuAllocator> gpu_allocator, const SymbolVarArray& src,
const OperatorNodeConfig& config) {
mgb_assert(
NV_TENSORRT_VERSION == getInferLibVersion(),
"TensorRT version mismatch: compiled with %d; detected %d at runtime , may "
"caused by customized environment, for example LD_LIBRARY_PATH on LINUX "
"and PATH on Windows!!",
NV_TENSORRT_VERSION, getInferLibVersion());
VarNodeArray var_node_array = cg::to_var_node_array(src);
auto tensor_rt_opr = std::make_unique<TensorRTRuntimeOpr>(
std::move(engine), std::move(gpu_allocator), var_node_array, config);
auto ret = cg::to_symbol_var_array(src[0].node()
->owner_graph()
->insert_opr(std::move(tensor_rt_opr))
->output());
ret.pop_back(); return ret;
}
SymbolVarArray TensorRTRuntimeOpr::make(
const void* buf, size_t buf_size, const SymbolVarArray& src,
const OperatorNodeConfig& config) {
mgb_throw_if(
!CompNode::get_device_count(CompNode::DeviceType::CUDA), SystemError,
"can not create TensorRTRuntimeOpr when CUDA is not available");
mgb_assert(!src.empty(), "no inputs provided");
initLibNvInferPlugins(&TensorRTOpr::Logger::instance(), "");
TensorRTUniquePtr<nvinfer1::IRuntime> runtime{
nvinfer1::createInferRuntime(TensorRTOpr::Logger::instance()), {}};
auto gpu_allocator = std::make_shared<GpuAllocator>(src[0].node()->comp_node());
runtime->setGpuAllocator(gpu_allocator.get());
auto engine = runtime->deserializeCudaEngine(buf, buf_size, nullptr);
mgb_assert(engine, "failed to deserialize ICudaEngine");
return make(to_shared_ptr_engine(engine), gpu_allocator, src, config);
}
void TensorRTRuntimeOpr::LoadDumpImpl::dump(
serialization::OprDumpContext& ctx, const cg::OperatorNodeBase& opr) {
TensorRTUniquePtr<nvinfer1::IHostMemory> buf{
opr.cast_final_safe<Opr>().trt_cuda_engine()->serialize(), {}};
mgb_assert(buf, "failed to serialize ICudaEngine");
ctx.dump_buf_with_len(buf->data(), buf->size());
}
cg::OperatorNodeBase* TensorRTRuntimeOpr::LoadDumpImpl::load(
serialization::OprLoadContext& ctx, const cg::VarNodeArray& inputs,
const OperatorNodeConfig& config) {
inputs.at(0)->comp_node().activate();
auto buf = ctx.load_shared_buf_with_len();
return Opr::make(buf.data(), buf.size(), cg::to_symbol_var_array(inputs), config)
.at(0)
.node()
->owner_opr();
}
#endif