#include "src/cuda/conv_bias/opr_impl.h"
#include "megdnn/dtype.h"
#include "src/cuda/conv_bias/algo.h"
#include "src/cuda/conv_bias/helper.h"
#include "src/cuda/handle.h"
#include "src/cuda/utils.h"
#include "src/common/algo_chooser.h"
#include "src/common/conv_bias.h"
#include "src/cuda/cudnn_with_check.h"
namespace megdnn {
namespace cuda {
void ConvBiasForwardImpl::exec(
_megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_in bias,
_megdnn_tensor_in z, _megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter, _megdnn_workspace workspace) {
check_exec_allow_noncontiguous(
src.layout, filter.layout, bias.layout, z.layout, dst.layout,
workspace.size, preprocessed_filter);
AlgoBase::ExecArgs args(
this, src, filter, bias, z, dst, workspace, preprocessed_filter);
auto algo = get_algorithm(
this, src.layout, filter.layout, bias.layout, z.layout, dst.layout);
algo->exec(args);
};
std::vector<ConvBiasForward::Algorithm*> ConvBiasForwardImpl::get_all_algorithms(
const TensorLayout& src, const TensorLayout& filter, const TensorLayout& bias,
const TensorLayout& z, const TensorLayout& dst) {
return megdnn::get_all_algorithms<ConvBiasForwardImpl>(
{this, src, filter, bias, z, dst});
}
std::vector<ConvBiasForward::Algorithm*> ConvBiasForwardImpl::get_all_algorithms_safe(
const TensorLayout& src, const TensorLayout& filter, const TensorLayout& bias,
const TensorLayout& z, const TensorLayout& dst) {
return megdnn::get_all_algorithms_safe<ConvBiasForwardImpl>(
{this, src, filter, bias, z, dst});
}
ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic(
const TensorLayout& src, const TensorLayout& filter, const TensorLayout& bias,
const TensorLayout& z, const TensorLayout& dst, size_t workspace_limit_in_bytes,
const AlgoAttribute& positive_attr, const AlgoAttribute& negative_attr) {
using namespace conv_bias;
AlgoBase::SizeArgs args{this, src, filter, bias, z, dst};
auto dst_layout = *args.dst_layout;
if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) {
dst_layout.dtype = DType();
args.opr->check_or_deduce_dtype_fwd(
args.src_layout->dtype, args.filter_layout->dtype, dst_layout.dtype);
}
auto conv_args = args;
auto cudnn_conv_bias_act_from_enum_wrapper =
[](cudnnConvolutionFwdAlgo_t algo) -> AlgoBase* {
return sm_algo_pack.cudnn_conv_bias_act_from_enum(algo);
};
auto cudnn_conv_from_enum_wrapper =
[](cudnnConvolutionFwdAlgo_t algo) -> AlgoBase* {
return sm_algo_pack.cudnn_conv_from_enum(algo);
};
auto get_cudnn_algo =
[this, &conv_args, &args, workspace_limit_in_bytes, positive_attr,
negative_attr](
const thin_function<AlgoBase*(cudnnConvolutionFwdAlgo_t)>& cb)
-> AlgoBase* {
auto cudnn_handle = cuda::cudnn_handle(this->handle());
CUDNNForwardDescs desc;
conv_args.init_conv_desc(desc);
#if CUDNN_MAJOR >= 7
int max_count = 0;
cudnn_check(
cudnnGetConvolutionForwardAlgorithmMaxCount(cudnn_handle, &max_count));
SmallVector<cudnnConvolutionFwdAlgoPerf_t> algo_perf(max_count);
int ret_count = 0;
cudnn_check(cudnnGetConvolutionForwardAlgorithm_v7(
cudnn_handle, desc.src_desc.desc, desc.filter_desc.desc,
desc.conv_desc.conv_desc, desc.dst_desc.desc, max_count, &ret_count,
algo_perf.data()));
for (int i = 0; i < ret_count; ++i) {
auto conv_bias_algo = cb(algo_perf[i].algo);
if (conv_bias_algo->is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
return conv_bias_algo;
}
}
#else
cudnnConvolutionFwdAlgo_t algo;
cudnn_check(cudnnGetConvolutionForwardAlgorithm(
cudnn_handle, desc.src_desc.desc, desc.filter_desc.desc,
desc.conv_desc.conv_desc, desc.dst_desc.desc,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_limit_in_bytes,
&algo));
auto conv_bias_algo = cb(algo);
if (conv_bias_algo->is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes))
return conv_bias_algo;
#endif
return nullptr;
};
auto get_1x1_algo = [workspace_limit_in_bytes, positive_attr,
negative_attr](const AlgoBase::SizeArgs& size_arg)
-> ConvBiasForwardImpl::AlgoBase* {
if (sm_algo_pack.batched_matmul.is_available_attribute(
size_arg, positive_attr, negative_attr, workspace_limit_in_bytes)) {
return &sm_algo_pack.batched_matmul;
}
return nullptr;
};
const bool is_chanwise = (args.filter_meta.format == Param::Format::NCHW &&
args.filter_meta.group == src[1]) ||
(args.filter_meta.format == Param::Format::NCHW4 &&
args.filter_meta.group == src[1] * 4) ||
(args.filter_meta.format == Param::Format::NCHW32 &&
args.filter_meta.group == src[1] * 32);
const bool slow_cudnn_chanwise_impl =
CUDNN_MAJOR < 7 || (CUDNN_MAJOR == 7 && CUDNN_MINOR < 5);
const int hw_size = src[2] * src[3];
const bool prefer_dnn_chanwise = slow_cudnn_chanwise_impl ||
args.filter_meta.stride[0] != 1 ||
args.filter_meta.stride[1] != 1 || hw_size < 512;
size_t fh = args.filter_meta.spatial[0], fw = args.filter_meta.spatial[1];
size_t hi = src[2], wi = src[3];
const bool prefer_dnn_lk_implbmm = hi <= 2 * fh && wi <= 2 * fw;
const bool prefer_direct_lk = fh > 9 && fw > 9;
if (is_chanwise) {
if (prefer_dnn_lk_implbmm) {
#if CUDA_VERSION >= 10020
if (sm_algo_pack.f16_implicit_bmm[0].is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes))
return &sm_algo_pack.f16_implicit_bmm[0];
#endif
if (sm_algo_pack.f32_implicit_bmm[0].is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes))
return &sm_algo_pack.f32_implicit_bmm[0];
} else if (
prefer_direct_lk &&
sm_algo_pack.depthwise_large_filter.is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
return &sm_algo_pack.depthwise_large_filter;
} else if (prefer_dnn_chanwise) {
if (sm_algo_pack.chanwise.is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes))
return &sm_algo_pack.chanwise;
if (sm_algo_pack.chanwise8x8x32.is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes))
return &sm_algo_pack.chanwise8x8x32;
} else {
conv_args.dst_layout = &dst_layout;
if (is_cudnn_supported(conv_args)) {
if (auto algo = get_cudnn_algo(cudnn_conv_from_enum_wrapper)) {
return algo;
}
}
}
}
bool cudnn_conv_bias_act_supported = false;
for (auto&& algo : sm_algo_pack.cudnn_conv_bias_activations) {
if (algo.is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
cudnn_conv_bias_act_supported = true;
break;
}
}
if (cudnn_conv_bias_act_supported) {
if (auto algo = get_cudnn_algo(cudnn_conv_bias_act_from_enum_wrapper))
return algo;
}
conv_args.dst_layout = &dst_layout;
if (is_cudnn_supported(conv_args)) {
if (auto algo = get_cudnn_algo(cudnn_conv_from_enum_wrapper))
return algo;
}
if (auto algo = get_1x1_algo(args)) {
return algo;
}
if (args.filter_meta.group > 1 &&
sm_algo_pack.group.is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
return &sm_algo_pack.group;
}
if (sm_algo_pack.fallback_nchw_qs8.is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
return &sm_algo_pack.fallback_nchw_qs8;
}
if (args.src_layout->dtype.enumv() != DTypeTrait<dtype::BFloat16>::enumv) {
return megdnn::get_algo_match_attribute<ConvBiasForwardImpl>(
sm_algo_pack.non_cudnn_algos, args, workspace_limit_in_bytes,
"cuda convbias fwd", positive_attr, negative_attr);
} else {
return megdnn::get_algo_match_attribute<ConvBiasForwardImpl>(
sm_algo_pack.bfloat16_algos, args, workspace_limit_in_bytes,
"cuda convbias fwd", positive_attr, negative_attr);
}
}
const char* ConvBiasForwardImpl::get_algorithm_set_name() const {
return "CONV_BIAS_CUDA";
}
size_t ConvBiasForwardImpl::get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter, const TensorLayout& bias,
const TensorLayout& z, const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter) {
TensorLayoutArray layouts{src, filter, bias, z, dst};
HeuristicCache::Key key{this->handle(), this->get_opr_type(),
layouts.data(), layouts.size(),
&this->param(), sizeof(this->param())};
auto rst = HeuristicCache::instance().get(key);
if (rst.policy.algo.valid()) {
return rst.workspace;
}
AlgoBase::SizeArgs args{this, src, filter, bias, z, dst, preprocessed_filter};
return get_algorithm(this, src, filter, bias, z, dst)->get_workspace_in_bytes(args);
};
size_t ConvBiasForwardImpl::get_preprocess_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter, const TensorLayout& bias,
const TensorLayout& z, const TensorLayout& dst) {
AlgoBase::SizeArgs args{this, src, filter, bias, z, dst};
return get_algorithm(this, src, filter, bias, z, dst)
->get_preprocess_workspace_in_bytes(args);
}
SmallVector<TensorLayout> ConvBiasForwardImpl::deduce_preprocessed_filter_layout(
const TensorLayout& src, const TensorLayout& filter, const TensorLayout& bias,
const TensorLayout& z, const TensorLayout& dst) {
AlgoBase::SizeArgs args{this, src, filter, bias, z, dst};
return get_algorithm(this, src, filter, bias, z, dst)
->deduce_preprocessed_filter_layout(args);
}
void ConvBiasForwardImpl::exec_preprocess(
const TensorLayout& src_layout, _megdnn_tensor_in filter,
_megdnn_tensor_in bias, const TensorLayout& z_layout,
const TensorLayout& dst_layout, PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) {
TensorND src{nullptr, src_layout}, dst{nullptr, dst_layout}, z{nullptr, z_layout};
AlgoBase::ExecArgs args(
this, src, filter, bias, z, dst, workspace, preprocessed_filter);
auto algo = get_algorithm(
this, src.layout, filter.layout, bias.layout, z.layout, dst.layout);
return algo->exec_preprocess(args);
}
} }