#include "src/x86/conv_bias/int8/algos.h"
#include "src/common/opr_delegate.h"
#include "src/common/utils.h"
#include "src/fallback/convolution/img2col_helper.h"
#include "src/x86/conv_bias/int8/algo_usable_preferred.h"
#include "src/x86/conv_bias/int8/avx2_chanwise_stride1.h"
#include "src/x86/conv_bias/int8/avx2_chanwise_stride2.h"
#include "src/x86/conv_bias/int8/avx2_direct_conv_stride1.h"
#include "src/x86/conv_bias/int8/avx2_direct_conv_stride2.h"
#include "src/x86/conv_bias/opr_impl.h"
#include "src/x86/conv_bias/postprocess_helper.h"
#include "src/x86/handle.h"
#include "src/x86/utils.h"
#if MEGDNN_X86_WITH_MKL_DNN
#include <mkldnn.hpp>
#endif
#include <cstring>
#if MEGDNN_X86_WITH_MKL_DNN
using namespace dnnl;
#endif
using namespace megdnn;
using namespace x86;
bool ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::usable(
const NCBKernSizeParam& param,
AlgoSelectionStrategy ) const {
return chanwise_avx2_stride1_qint8_usable(param);
}
WorkspaceBundle ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::get_bundle(
const NCBKernSizeParam& param) {
size_t nr_threads = param.nr_threads;
size_t IH2, IW2, OH2, OW2;
size_t src_size = 0, dst_size = 0, int32_temp = 0;
get_rectified_size(param, IH2, IW2, OH2, OW2);
if (need_src_copy(param)) {
src_size = IH2 * IW2 * sizeof(int8_t) * nr_threads;
}
if (need_dst_copy(param)) {
dst_size = OH2 * OW2 * param.dst_type.size() * nr_threads;
}
bool dst_need_convert = param.dst_type.enumv() == DTypeEnum::QuantizedS8;
if (dst_need_convert) {
int32_temp = OH2 * OW2 * sizeof(int32_t) * nr_threads;
}
return dst_need_convert ? WorkspaceBundle(nullptr, {src_size, dst_size, int32_temp})
: WorkspaceBundle(nullptr, {src_size, dst_size});
}
size_t ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::get_workspace(
const NCBKernSizeParam& param) const {
return get_bundle(param).total_size_in_bytes();
}
SmallVector<fallback::ConvBiasImpl::NCBKern> ConvBiasImpl::
AlgoChanWiseAvx2Stride1Qint8::get_kimpls(const NCBKernSizeParam& param) const {
auto bundle = get_bundle(param);
return avx2_chanwise_stride1::get_kimpls(param, bundle);
}
bool ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::is_preferred(
const NCBKernSizeParam& param) const {
return chanwise_avx2_stride1_qint8_preferred(param);
}
bool ConvBiasImpl::AlgoChanWiseAvx2Stride2Qint8::usable(
const NCBKernSizeParam& param,
AlgoSelectionStrategy ) const {
return chanwise_avx2_stride2_qint8_usable(param);
}
WorkspaceBundle ConvBiasImpl::AlgoChanWiseAvx2Stride2Qint8::get_bundle(
const NCBKernSizeParam& param) {
size_t nr_threads = param.nr_threads;
size_t IH2, IW2, OH2, OW2;
size_t src_size = 0, dst_size = 0, int32_temp = 0;
get_rectified_size(param, IH2, IW2, OH2, OW2);
if (need_src_copy(param)) {
src_size = IH2 * IW2 * sizeof(int8_t) * nr_threads;
}
if (need_dst_copy(param)) {
dst_size = OH2 * OW2 * param.dst_type.size() * nr_threads;
}
bool dst_need_convert = param.dst_type.enumv() == DTypeEnum::QuantizedS8;
if (dst_need_convert) {
int32_temp = OH2 * OW2 * sizeof(int32_t) * nr_threads;
}
return dst_need_convert ? WorkspaceBundle(nullptr, {src_size, dst_size, int32_temp})
: WorkspaceBundle(nullptr, {src_size, dst_size});
}
size_t ConvBiasImpl::AlgoChanWiseAvx2Stride2Qint8::get_workspace(
const NCBKernSizeParam& param) const {
return get_bundle(param).total_size_in_bytes();
}
SmallVector<fallback::ConvBiasImpl::NCBKern> ConvBiasImpl::
AlgoChanWiseAvx2Stride2Qint8::get_kimpls(const NCBKernSizeParam& param) const {
auto bundle = get_bundle(param);
return avx2_chanwise_stride2::get_kimpls(param, bundle);
}
bool ConvBiasImpl::AlgoChanWiseAvx2Stride2Qint8::is_preferred(
const NCBKernSizeParam& param) const {
return chanwise_avx2_stride2_qint8_preferred(param);
}
bool ConvBiasImpl::AlgoDirectAvx2Stride1Int8::usable(
const NCBKernSizeParam& param,
AlgoSelectionStrategy ) const {
return direct_avx2_stride1_int8_usable(param);
}
WorkspaceBundle ConvBiasImpl::AlgoDirectAvx2Stride1Int8::get_bundle(
const NCBKernSizeParam& param) {
auto&& fm = param.filter_meta;
size_t N = param.n;
size_t IC = fm.icpg;
size_t OC = fm.ocpg;
size_t IH = param.isz[0];
size_t IW = param.isz[1];
size_t OH = param.osz[0];
size_t OW = param.osz[1];
size_t FH = fm.spatial[0];
size_t FW = fm.spatial[1];
size_t GROUP = fm.group;
size_t IC_STEP = 2, OC_STEP = 4, IW_STEP = 8;
size_t pad_h = fm.padding[0];
size_t pad_w = fm.padding[1];
size_t src_size = 0, filter_size = 0;
filter_size = GROUP * round_up(OC, OC_STEP) * round_up(IC, IC_STEP) * FH * FW *
sizeof(int16_t);
src_size = N * GROUP * div_ceil(IC, IC_STEP) * (IH + 2 * pad_h) *
round_up(IW + 2 * pad_w, IW_STEP) * 2 * sizeof(int8_t);
bool need_post_process = param.dst_type.enumv() == DTypeEnum::QuantizedS8;
if (need_post_process) {
size_t dst_tmp = N * GROUP * OC * OW * OH * sizeof(int32_t);
return WorkspaceBundle(nullptr, {src_size, filter_size, dst_tmp});
} else {
return WorkspaceBundle(nullptr, {src_size, filter_size});
}
}
size_t ConvBiasImpl::AlgoDirectAvx2Stride1Int8::get_workspace(
const NCBKernSizeParam& param) const {
return get_bundle(param).total_size_in_bytes();
}
SmallVector<fallback::ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoDirectAvx2Stride1Int8::
get_kimpls(const NCBKernSizeParam& param) const {
auto bundle = get_bundle(param);
return direct_conv_avx2_stride1::get_kimpls(param, bundle);
}
bool ConvBiasImpl::AlgoDirectAvx2Stride1Int8::is_preferred(
const NCBKernSizeParam& param) const {
return direct_avx2_stride1_int8_preferred(param);
}
bool ConvBiasImpl::AlgoAVX2DirectConvStride2::usable(
const NCBKernSizeParam& param, AlgoSelectionStrategy) const {
return direct_avx2_stride2_int8_usable(param);
}
WorkspaceBundle ConvBiasImpl::AlgoAVX2DirectConvStride2::get_bundle(
const NCBKernSizeParam& param) {
auto&& fm = param.filter_meta;
size_t N = param.n;
size_t IC = fm.icpg;
size_t OC = fm.ocpg;
size_t IH = param.isz[0];
size_t IW = param.isz[1];
size_t OH = param.osz[0];
size_t OW = param.osz[1];
size_t FH = fm.spatial[0];
size_t FW = fm.spatial[1];
size_t GROUP = fm.group;
size_t IC_STEP = 2, OC_STEP = 4;
size_t pad_h = fm.padding[0];
size_t pad_w = fm.padding[1];
size_t src_size = 0, filter_size = 0;
filter_size = GROUP * round_up(OC, OC_STEP) * round_up(IC, IC_STEP) * FH * FW *
sizeof(int16_t);
src_size = N * GROUP * div_ceil(IC, IC_STEP) * (IH + 2 * pad_h) * (IW + 2 * pad_w) *
2 * sizeof(int8_t) +
32;
bool need_post_process = param.dst_type.enumv() == DTypeEnum::QuantizedS8;
if (need_post_process) {
size_t dst_tmp = N * GROUP * OC * OW * OH * sizeof(int32_t);
return WorkspaceBundle(nullptr, {src_size, filter_size, dst_tmp});
} else {
return WorkspaceBundle(nullptr, {src_size, filter_size});
}
}
size_t ConvBiasImpl::AlgoAVX2DirectConvStride2::get_workspace(
const NCBKernSizeParam& param) const {
return get_bundle(param).total_size_in_bytes();
}
SmallVector<fallback::ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoAVX2DirectConvStride2::
get_kimpls(const NCBKernSizeParam& param) const {
auto bundle = get_bundle(param);
return direct_conv_avx2_stride2::get_kimpls(param, bundle);
}
bool ConvBiasImpl::AlgoAVX2DirectConvStride2::is_preferred(
const NCBKernSizeParam& param) const {
return direct_avx2_stride2_int8_preferred(param);
}
#if MEGDNN_X86_WITH_MKL_DNN
bool ConvBiasImpl::AlgoMkldnnQint8::usable(
const NCBKernSizeParam& param, AlgoSelectionStrategy) const {
return mkldnn_qint8_usable(param);
}
WorkspaceBundle ConvBiasImpl::AlgoMkldnnQint8::get_bundle(
const NCBKernSizeParam& param) {
if (!is_supported(SIMDType::VNNI)) {
size_t N = param.n;
size_t IC = param.filter_meta.icpg;
size_t IH = param.isz[0];
size_t IW = param.isz[1];
size_t size = (N * IC * IH * IW) * sizeof(uint8_t);
return WorkspaceBundle{nullptr, {size}};
} else {
return WorkspaceBundle{nullptr, {0}};
}
}
#define REORDER_MEMORY(megdnn_memory, reorder_memory) \
do { \
auto reorder_pd = reorder::primitive_desc( \
eng_mkldnn, megdnn_memory.get_desc(), eng_mkldnn, \
reorder_memory.get_desc()); \
auto reorder_exe = reorder(reorder_pd); \
reorder_exe.execute(stream_mkldnn, megdnn_memory, reorder_memory); \
} while (0)
void ConvBiasImpl::AlgoMkldnnQint8::kern_mkldnn_s8x8x32(
const NCBKernParam& param, const NCBKernIndex& ncb_index) {
UNPACK_CONV_F32_NCB_KERN_SIZES(param);
MEGDNN_MARK_USED_VAR(N);
size_t group_id = ncb_index.ndrange_id[0];
size_t batch_id = ncb_index.ndrange_id[1];
auto x86_handle = static_cast<HandleImpl*>(inplace_cpu_handle().get());
megdnn_assert(x86_handle != nullptr, "x86 handle can not be null");
auto eng_mkldnn = x86_handle->mkldnn_engine();
auto stream_mkldnn = x86_handle->mkldnn_stream();
memory::dims src_shape = {1, IC, IH, IW};
memory::dims weight_shape = {OC, IC, FH, FW};
memory::dims dst_shape = {1, OC, OH, OW};
memory::dims strides_shape = {SH, SW};
memory::dims padding_shape = {PH, PW};
auto megdnn_src_md =
memory::desc({src_shape}, memory::data_type::s8, memory::format_tag::nchw);
auto megdnn_weight_md = memory::desc(
{weight_shape}, memory::data_type::s8, memory::format_tag::oihw);
auto megdnn_dst_md =
memory::desc({dst_shape}, memory::data_type::s32, memory::format_tag::nchw);
auto megdnn_weight_memory =
memory(megdnn_weight_md, eng_mkldnn,
const_cast<void*>(param.filter<void>(group_id)));
int8_t* src = const_cast<int8_t*>(param.src<int8_t>(batch_id, group_id));
int32_t* dst = param.dst<int32_t>(batch_id, group_id);
auto megdnn_src_memory = memory(megdnn_src_md, eng_mkldnn, static_cast<void*>(src));
auto megdnn_dst_memory = memory(megdnn_dst_md, eng_mkldnn, static_cast<void*>(dst));
if (is_supported(SIMDType::VNNI)) {
auto conv_src_md = memory::desc(
{src_shape}, memory::data_type::s8, memory::format_tag::any);
auto conv_weights_md = memory::desc(
{weight_shape}, memory::data_type::s8, memory::format_tag::any);
auto conv_dst_md = memory::desc(
{dst_shape}, memory::data_type::s32, memory::format_tag::any);
auto conv_desc = convolution_forward::desc(
prop_kind::forward, algorithm::convolution_auto, conv_src_md,
conv_weights_md, conv_dst_md, strides_shape, padding_shape,
padding_shape);
auto conv_prim_desc =
convolution_forward::primitive_desc(conv_desc, eng_mkldnn);
auto conv = convolution_forward(conv_prim_desc);
memory conv_src_memory = memory(conv_prim_desc.src_desc(), eng_mkldnn);
memory conv_weight_memory = memory(conv_prim_desc.weights_desc(), eng_mkldnn);
memory conv_dst_memory;
REORDER_MEMORY(megdnn_src_memory, conv_src_memory);
REORDER_MEMORY(megdnn_weight_memory, conv_weight_memory);
if (megdnn_dst_memory.get_desc() != conv_prim_desc.dst_desc()) {
conv_dst_memory = memory(conv_prim_desc.dst_desc(), eng_mkldnn);
} else {
conv_dst_memory = megdnn_dst_memory;
}
conv.execute(
stream_mkldnn, {{DNNL_ARG_SRC, conv_src_memory},
{DNNL_ARG_WEIGHTS, conv_weight_memory},
{DNNL_ARG_DST, conv_dst_memory}});
REORDER_MEMORY(conv_dst_memory, megdnn_dst_memory);
stream_mkldnn.wait();
} else {
std::vector<primitive> net;
std::vector<std::unordered_map<int, memory>> net_args;
uint8_t* const_128 = static_cast<uint8_t*>(param.workspace_ptr);
std::memset(const_128, 128u, get_bundle(param).total_size_in_bytes());
auto megdnn_128_md = memory::desc(
{src_shape}, memory::data_type::u8, memory::format_tag::nchw);
auto megdnn_128_memory =
memory(megdnn_128_md, eng_mkldnn, static_cast<void*>(const_128));
auto conv_128_dst_memory = memory(megdnn_dst_md, eng_mkldnn);
auto conv_desc1 = convolution_forward::desc(
prop_kind::forward, algorithm::convolution_auto, megdnn_128_md,
megdnn_weight_md, megdnn_dst_md, strides_shape, padding_shape,
padding_shape);
auto conv_prim_desc1 =
convolution_forward::primitive_desc(conv_desc1, eng_mkldnn);
net.push_back(convolution_forward(conv_prim_desc1));
net_args.push_back(
{{DNNL_ARG_SRC, megdnn_128_memory},
{DNNL_ARG_WEIGHTS, megdnn_weight_memory},
{DNNL_ARG_DST, conv_128_dst_memory}});
memory conv_src_add_128_memory = megdnn_128_memory;
auto sum_128_desc = sum::primitive_desc(
conv_src_add_128_memory.get_desc(), {1.0f, 1.0f},
{megdnn_128_md, megdnn_src_md}, eng_mkldnn);
net.push_back(sum(sum_128_desc));
net_args.push_back(
{{DNNL_ARG_MULTIPLE_SRC, megdnn_128_memory},
{DNNL_ARG_MULTIPLE_SRC + 1, megdnn_src_memory},
{DNNL_ARG_DST, conv_src_add_128_memory}});
auto conv_desc2 = convolution_forward::desc(
prop_kind::forward, algorithm::convolution_auto, megdnn_128_md,
megdnn_weight_md, megdnn_dst_md, strides_shape, padding_shape,
padding_shape);
auto conv_prim_desc2 =
convolution_forward::primitive_desc(conv_desc2, eng_mkldnn);
net.push_back(convolution_forward(conv_prim_desc2));
net_args.push_back(
{{DNNL_ARG_SRC, conv_src_add_128_memory},
{DNNL_ARG_WEIGHTS, megdnn_weight_memory},
{DNNL_ARG_DST, megdnn_dst_memory}});
auto sub_128_desc = sum::primitive_desc(
megdnn_dst_md, {1.0f, -1.0f}, {megdnn_dst_md, megdnn_dst_md},
eng_mkldnn);
net.push_back(sum(sub_128_desc));
net_args.push_back(
{{DNNL_ARG_MULTIPLE_SRC, megdnn_dst_memory},
{DNNL_ARG_MULTIPLE_SRC + 1, conv_128_dst_memory},
{DNNL_ARG_DST, megdnn_dst_memory}});
for (size_t i = 0; i < net.size(); ++i) {
net.at(i).execute(stream_mkldnn, net_args.at(i));
}
stream_mkldnn.wait();
}
}
#undef REORDER_MEMORY
bool ConvBiasImpl::AlgoMkldnnQint8::is_preferred(const NCBKernSizeParam& param) const {
return mkldnn_qint8_preferred(param);
}
bool ConvBiasImpl::AlgoMkldnnMatmulQint8::usable(
const NCBKernSizeParam& param, AlgoSelectionStrategy) const {
return mkldnn_matmul_qint8_usable(param);
}
bool ConvBiasImpl::AlgoMkldnnMatmulQint8::is_preferred(
const NCBKernSizeParam& param) const {
return mkldnn_matmul_qint8_preferred(param);
}
WorkspaceBundle ConvBiasImpl::AlgoMkldnnMatmulQint8::get_bundle(
const NCBKernSizeParam& param) {
UNPACK_CONV_F32_NCB_KERN_SIZES(param);
megdnn_ignore(N);
megdnn_ignore(OC);
auto IW2 = IH + 2 * PH;
auto IH2 = IW + 2 * PW;
bool can_matrix_mul_direct =
(FH == 1 && FW == 1 && SH == 1 && SW == 1 && PH == 0 && PW == 0);
size_t part0, part1, part2;
if (can_matrix_mul_direct) {
part0 = part1 = 0;
} else {
part0 = (IC * IH2 * IW2 + 4) * sizeof(int8_t);
part1 = (IC * FH * FW * OH * OW + 4) * sizeof(int8_t);
}
{
TensorLayout A_, B_, C_;
A_ = TensorLayout({OC, IC * FH * FW}, dtype::Int8());
B_ = TensorLayout({IC * FH * FW, OH * OW}, dtype::Int8());
C_ = TensorLayout({OC, OH * OW}, dtype::Int32());
part2 = get_matmul_opr()->get_workspace_in_bytes(A_, B_, C_);
}
return {nullptr, {part0, part1, part2}};
}
MatrixMul* ConvBiasImpl::AlgoMkldnnMatmulQint8::get_matmul_opr() {
static CpuOprDelegationStorage<> storage;
return storage.get<MatrixMul>();
}
void ConvBiasImpl::AlgoMkldnnMatmulQint8::kern_mkldnn_matmul_s8x8x32(
const NCBKernParam& param, const NCBKernIndex& ncb_index) {
UNPACK_CONV_F32_NCB_KERN_SIZES(param);
auto IH2 = IH + 2 * PH;
auto IW2 = IW + 2 * PW;
size_t group_id = ncb_index.ndrange_id[0];
bool is_xcorr = !param.filter_meta.should_flip;
auto bundle = get_bundle(param);
bundle.set(param.workspace_ptr);
for (size_t n = 0; n < N; ++n) {
int8_t* src = const_cast<int8_t*>(param.src<int8_t>(n, group_id));
int32_t* dst = param.dst<int32_t>(n, group_id);
int8_t *B, *src2;
if (FH == 1 && FW == 1 && SH == 1 && SW == 1 && PH == 0 && PW == 0) {
B = src;
} else {
src2 = static_cast<int8_t*>(bundle.get(0));
int8_t* src2_ptr = src2;
const int8_t* src_ptr = src;
rep(ic, IC) {
if (PH != 0) {
std::memset(src2_ptr, 0, sizeof(int8_t) * PH * IW2);
src2_ptr += PH * IW2;
}
rep(ih, IH) {
if (PW != 0)
rep(pw, PW) * (src2_ptr++) = 0.0f;
std::memcpy(src2_ptr, src_ptr, sizeof(int8_t) * IW);
src2_ptr += IW;
src_ptr += IW;
if (PW != 0)
rep(pw, PW) * (src2_ptr++) = 0.0f;
}
if (PH != 0) {
std::memset(src2_ptr, 0, sizeof(int8_t) * PH * IW2);
src2_ptr += PH * IW2;
}
}
B = static_cast<int8_t*>(bundle.get(1));
if (SH == 1 && SW == 1) {
if (is_xcorr) {
img2col<true>(src2, B, OC, OH, OW, IC, IH2, IW2, FH, FW);
} else {
img2col<false>(src2, B, OC, OH, OW, IC, IH2, IW2, FH, FW);
}
} else {
if (is_xcorr) {
img2col_stride<true>(
src2, B, OC, OH, OW, IC, IH2, IW2, FH, FW, SH, SW);
} else {
img2col_stride<false>(
src2, B, OC, OH, OW, IC, IH2, IW2, FH, FW, SH, SW);
}
}
}
{
TensorND A_, B_, C_;
A_.layout = TensorLayout({OC, IC * FH * FW}, dtype::Int8());
A_.reset_ptr(const_cast<int8_t*>(param.filter<int8_t>(group_id)));
B_.layout = TensorLayout({IC * FH * FW, OH * OW}, dtype::Int8());
B_.reset_ptr(B);
C_.layout = TensorLayout({OC, OH * OW}, dtype::Int32());
C_.reset_ptr(dst);
Workspace workspace(
static_cast<dt_byte*>(bundle.get(2)), bundle.get_size(2));
get_matmul_opr()->exec(A_, B_, C_, workspace);
}
}
}
#endif