#include "./compiler_cuda.h"
#include <cstdio>
#include "./codegen_cuda.h"
#include "megbrain/common.h"
#include "megbrain/comp_node_env.h"
#include "megbrain/jit/param_elem_visitor.h"
#include "megbrain/jit/utils.h"
#include "megbrain/utils/persistent_cache.h"
#include "megbrain/utils/timer.h"
#if MGB_JIT && MGB_CUDA
#include <dlfcn.h>
#include <nvrtc.h>
using namespace mgb;
using namespace jit;
namespace {
std::string NVRTCCompile(const std::string& code, int cap_major, int cap_minor) {
static std::vector<std::string> cuda_include_opts = get_cuda_include_opts();
auto arch_opt = ssprintf("--gpu-architecture=compute_%d%d", cap_major, cap_minor);
std::vector<const char*> opts;
opts.push_back(arch_opt.c_str());
for (auto& inc_path : cuda_include_opts)
opts.push_back(inc_path.c_str());
nvrtcProgram prog;
MGB_NVRTC_CHECK(
nvrtcCreateProgram(&prog, code.c_str(), nullptr, 0, nullptr, nullptr));
std::unique_ptr<nvrtcProgram, void (*)(nvrtcProgram*)> prog_release{
&prog, [](nvrtcProgram* p) { MGB_NVRTC_CHECK(nvrtcDestroyProgram(p)); }};
nvrtcResult compile_res = nvrtcCompileProgram(prog, opts.size(), opts.data());
size_t log_size;
MGB_NVRTC_CHECK(nvrtcGetProgramLogSize(prog, &log_size));
std::string log;
log.resize(log_size);
MGB_NVRTC_CHECK(nvrtcGetProgramLog(prog, &log[0]));
mgb_throw_if(
compile_res != NVRTC_SUCCESS, SystemError,
"nvrtc compile error: %s\n========= source code\n%s", log.c_str(),
code.c_str());
size_t ptx_size;
MGB_NVRTC_CHECK(nvrtcGetPTXSize(prog, &ptx_size));
std::string ptx;
ptx.resize(ptx_size);
MGB_NVRTC_CHECK(nvrtcGetPTX(prog, &ptx[0]));
return ptx;
}
void make_fastdiv(Uint32Fastdiv& fdiv, uint32_t d) {
mgb_assert(d);
fdiv.m_divisor = d;
constexpr uint32_t MAX_U32 = ~0u;
fdiv.m_inc_dividend = 0;
fdiv.m_divisor_is_not_1 = ~0u;
if (!(d & (d - 1))) {
fdiv.m_mul = 1u << 31;
int p = 0;
while ((1u << p) < d)
++p;
mgb_assert((1u << p) == d);
fdiv.m_shift = p ? p - 1 : 0;
if (d == 1)
fdiv.m_divisor_is_not_1 = 0;
return;
}
auto n_bound = uint64_t(d / 2 + 1) * MAX_U32;
uint32_t shift = 32;
while ((1ull << shift) < n_bound)
++shift;
uint64_t mdst = 1ull << shift;
int64_t delta = d - mdst % d;
fdiv.m_mul = mdst / d + 1;
if ((uint64_t)delta > d / 2) {
delta -= d;
--fdiv.m_mul;
fdiv.m_inc_dividend = 1;
}
mgb_assert((uint64_t)fdiv.m_mul * d == mdst + delta);
delta = delta >= 0 ? delta : -delta;
mgb_assert((uint64_t)delta * MAX_U32 < mdst);
fdiv.m_shift = shift - 32;
}
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
template <int ndim>
void host_init_pvisitor(ParamElemVisitor<ndim>& pvis, const TensorLayout& layout) {
mgb_assert(layout.ndim && layout.ndim <= ndim);
for (uint32_t i = 0; i < layout.ndim; ++i) {
pvis.m_stride[i] = layout.stride[i];
if (i + 1 < layout.ndim) {
make_fastdiv(pvis.m_shape_highdim[i], layout.shape[i + 1]);
}
}
for (int i = layout.ndim - 1; i < ndim - 1; ++i) {
make_fastdiv(pvis.m_shape_highdim[i], 1);
}
for (int i = layout.ndim; i < ndim; ++i) {
pvis.m_stride[i] = 0;
}
}
#pragma GCC diagnostic pop
template <size_t out_dim>
void setup_and_launch(const JITExecutor* fusion_opr, CUfunction func, int block_size) {
auto&& args = fusion_opr->args();
size_t nr_inps = args.inputs.size();
bool copy_param_to_dev = nr_inps > CudaCompiler::MAX_CUDA_NR_INPUT;
SmallVector<CUdeviceptr> datum(nr_inps + 1);
SmallVector<ParamElemVisitor<out_dim>> pvisitors;
pvisitors.reserve(nr_inps);
for (size_t i = 0; i < args.inputs.size(); i++) {
datum[i] = reinterpret_cast<CUdeviceptr>(
args.inputs[i].from->dev_tensor().raw_ptr());
host_init_pvisitor<out_dim>(pvisitors[i], args.inputs[i].layout);
}
datum[nr_inps] = reinterpret_cast<CUdeviceptr>(
args.outputs[0].from->dev_tensor().as_megdnn().raw_ptr());
size_t num_elements = args.outputs[0].layout.total_nr_elems();
mgb_assert(
num_elements <= UINT32_MAX,
"Currently JIT only supports 32 bit of elememt size for better "
"performance");
int num_block = (num_elements - 1) / (block_size * 3) + 1;
void* exec_args[3];
exec_args[1] = &num_elements;
void* datum_dev = nullptr;
void* p_visitors_dev = nullptr;
const CompNodeEnv& env = CompNodeEnv::from_comp_node(fusion_opr->comp_node());
if (!copy_param_to_dev) {
exec_args[0] = datum.data();
exec_args[2] = pvisitors.data();
} else {
datum_dev = args.outputs[1].from->dev_tensor().as_megdnn().raw_ptr();
MGB_CUDA_CHECK(cudaMemcpyAsync(
datum_dev, datum.data(), (nr_inps + 1) * sizeof(CUdeviceptr),
cudaMemcpyHostToDevice, env.cuda_env().stream));
p_visitors_dev = args.outputs[2].from->dev_tensor().as_megdnn().raw_ptr();
MGB_CUDA_CHECK(cudaMemcpyAsync(
p_visitors_dev, pvisitors.data(),
nr_inps * sizeof(ParamElemVisitor<out_dim>), cudaMemcpyHostToDevice,
env.cuda_env().stream));
exec_args[0] = &datum_dev;
exec_args[2] = &p_visitors_dev;
}
MGB_CUDA_CU_CHECK(cuLaunchKernel(
func, num_block, 1, 1, block_size, 1, 1, 0, env.cuda_env().stream,
exec_args, 0));
}
}
void mgb::jit::_on_nvrtc_error(
const char* expr, nvrtcResult nvrtc_res, const char* file, const char* func,
int line) {
mgb_throw(
CudaError, "nvrtc error %d: %s (%s at %s:%s:%d)", int(nvrtc_res),
nvrtcGetErrorString(nvrtc_res), expr, file, func, line);
}
CudaExecutable::CudaExecutable(std::string source, std::string name)
: m_source{std::move(source)}, m_name{std::move(name)} {}
void CudaExecutable::execute(JITExecutor* fusion_opr) {
FuncCache* func;
auto cn = fusion_opr->comp_node();
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
{
MGB_LOCK_GUARD(m_mtx);
func = &m_func_cache[{prop.major, prop.minor}];
}
{
MGB_LOCK_GUARD(func->mtx);
if (func->ptx.empty()) {
func->compile(
"jit:nvrtc:" + PersistentCache::make_category_from_comp_node(cn),
prop.major, prop.minor, this);
}
}
func->exec(fusion_opr, this);
}
void CudaExecutable::FuncCache::compile(
const std::string& cache_category, int major, int minor,
const CudaExecutable* cuda_exe) {
RealTimer timer;
auto&& cache = PersistentCache::inst();
PersistentCache::Blob key{cuda_exe->m_source.data(), cuda_exe->m_source.size()};
auto ptx_cache = cache.get(cache_category, key);
if (ptx_cache.valid()) {
ptx.assign(static_cast<const char*>(ptx_cache->ptr), ptx_cache->size);
} else {
ptx = NVRTCCompile(cuda_exe->m_source, major, minor);
ptx_cache = PersistentCache::Blob{ptx.data(), ptx.size()};
cache.put(cache_category, key, ptx_cache.val());
mgb_log("NVRTC JIT: compile %s for %d.%d: source_len=%zu ptx_len=%zu "
"time=%.3fms",
cuda_exe->m_name.c_str(), major, minor, key.size, ptx.size(),
timer.get_msecs());
}
}
void CudaExecutable::FuncCache::exec(
const JITExecutor* fusion_opr, const CudaExecutable* cuda_exe) {
Func* func;
{
MGB_LOCK_GUARD(mtx);
auto ins = cn2func.insert({fusion_opr->comp_node(), {}});
func = &ins.first->second;
if (ins.second) {
MGB_CUDA_CU_CHECK(cuModuleLoadData(&func->module, ptx.data()));
MGB_CUDA_CU_CHECK(cuModuleGetFunction(
&func->func, func->module, cuda_exe->m_name.c_str()));
int min_grid_size = 0;
MGB_CUDA_CU_CHECK(cuOccupancyMaxPotentialBlockSize(
&min_grid_size, &func->block_size, func->func, nullptr, 0, 0));
}
}
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-value"
int out_dim = fusion_opr->args().outputs[0].layout.ndim;
#define cb_outdim(EXPECTED_OUTDIM) \
if (EXPECTED_OUTDIM == out_dim) { \
setup_and_launch<EXPECTED_OUTDIM>(fusion_opr, func->func, func->block_size); \
return; \
}
#pragma GCC diagnostic push
cb_outdim(1);
cb_outdim(2);
cb_outdim(3);
cb_outdim(4);
mgb_throw(InternalError, "unsupported out_dim=%zu", static_cast<size_t>(out_dim));
#undef cb_outdim
}
CudaExecutable::~CudaExecutable() {
for (auto&& i : m_func_cache) {
for (auto&& j : i.second.cn2func) {
j.first.activate();
if (auto m = j.second.module) {
cuModuleUnload(m);
}
}
}
}
std::unique_ptr<Executable> CudaCompiler::do_compile(
const InternalGraph& graph, const JITExecutor::Args& args) {
bool copy_param_to_dev = graph.placeholders().size() > MAX_CUDA_NR_INPUT;
if (copy_param_to_dev) {
mgb_log_warn(
"Too many[%zu] inputs, which exceeds the limit[%zu]. JIT "
"kernel function's parameters will be "
"put in GPU global memory.",
graph.placeholders().size(), MAX_CUDA_NR_INPUT);
}
std::string source, kernel_name;
std::tie(kernel_name, source) = codegen_cuda(graph, args, copy_param_to_dev);
auto ret =
std::make_unique<CudaExecutable>(std::move(source), std::move(kernel_name));
return ret;
}
size_t CudaCompiler::get_nr_workspace_outputs(JITExecutor* opr) const {
if (opr->input().size() > MAX_CUDA_NR_INPUT) {
return 2;
}
return 0;
}
void CudaCompiler::init_workspace_size_infer(JITExecutor* opr) {
if (opr->output().size() == 3) {
using namespace cg::static_infer;
auto&& mgr = opr->owner_graph()->static_infer_manager();
TensorShape output_shape1(
{(opr->input().size() + 1) * sizeof(unsigned long long)});
mgr.register_shape_infer(
opr->output(1), ShapeInferDesc::make_const(output_shape1));
TensorShape output_shape2({opr->input().size() * sizeof(ParamElemVisitor<4>)});
mgr.register_shape_infer(
opr->output(2), ShapeInferDesc::make_const(output_shape2));
}
}
#endif