#include "./legacy_checker.h"
#include "megbrain/comp_node_env.h"
#include "megbrain/gopt/inference.h"
#include "megbrain/opr/basic_arith.h"
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/serialization/serializer.h"
#include "megbrain/test/autocheck.h"
#include "megbrain/test/helper.h"
#include "megbrain/test/megdnn_helper.h"
#include "megdnn/dtype.h"
#include "megdnn/heuristic_cache.h"
#include "megdnn/oprs/base.h"
#include <gmock/gmock.h>
#include <cmath>
#include <memory>
#include <random>
using namespace mgb;
namespace {
// Shorthand aliases for the parameter structs of the 2D and 3D convolution
// operators and for the 2D convolution mode enum.
using Param = opr::Convolution::Param;
using Param3D = opr::Convolution3D::Param;
using Mode = Param::Mode;
// Convolution modes that the forward/backward checker tests iterate over.
Mode modes_to_check[] = {Mode::CONVOLUTION, Mode::CROSS_CORRELATION};
//! Brute-force float32 reference for ConvolutionBackwardData in NCHW format.
//!
//! \param inps  inps[0] is read as the 4-dim data tensor, inps[1] as the filter
//!              (4-dim for DENSE; 5-dim with a leading group axis for GROUP).
//! \param dest  receives a freshly allocated result tensor on "xpu0".
//! \param param convolution parameters; only the NCHW format is accepted.
//!
//! Every data element is scattered into the output, weighted by each filter
//! tap; taps that land outside the output extent are dropped. The data tensor
//! is walked linearly, i.e. it is read as if it were contiguous.
void conv_bwd_data_brute(
        const std::vector<std::shared_ptr<HostTensorND>>& inps,
        std::shared_ptr<HostTensorND>& dest,
        const opr::ConvolutionBackwardData::Param& param) {
    mgb_assert(param.format == Param::Format::NCHW);
    auto& src = *inps[0];
    auto& flt = *inps[1];

    const size_t batch = src.shape(0), src_h = src.shape(2), src_w = src.shape(3);

    const bool dense = param.sparse == Param::Sparse::DENSE;
    if (!dense) {
        mgb_assert(param.sparse == Param::Sparse::GROUP);
    }
    // a GROUP filter carries one extra leading axis, shifting all others by one
    const size_t ax = dense ? 0 : 1;
    const size_t nr_group = dense ? 1 : flt.shape(0);
    const size_t icpg = flt.shape(ax), ocpg = flt.shape(ax + 1),
                 flt_h = flt.shape(ax + 2), flt_w = flt.shape(ax + 3);

    // widen the parameters once so that all extent arithmetic is done in size_t
    const size_t stride_h = param.stride_h, stride_w = param.stride_w;
    const size_t pad_h = param.pad_h, pad_w = param.pad_w;
    const size_t dilate_h = param.dilate_h, dilate_w = param.dilate_w;

    const size_t dst_h =
            (src_h - 1) * stride_h + (flt_h - 1) * dilate_h + 1 - pad_h * 2;
    const size_t dst_w =
            (src_w - 1) * stride_w + (flt_w - 1) * dilate_w + 1 - pad_w * 2;

    dest = std::make_shared<HostTensorND>(
            CompNode::load("xpu0"),
            TensorShape{batch, ocpg * nr_group, dst_h, dst_w});
    auto& dst = *dest;

    const float* flt_ptr = flt.ptr<float>();
    const float* src_ptr = src.ptr<float>();
    float* dst_ptr = dst.ptr<float>();
    memset(dst_ptr, 0, sizeof(float) * dst.shape().total_nr_elems());

    const auto dst_layout = dst.layout();
    const auto flt_layout = flt.layout();

    size_t src_idx = 0;
    for (size_t n = 0; n < batch; ++n) {
        for (size_t g = 0; g < nr_group; ++g) {
            for (size_t ic = 0; ic < icpg; ++ic) {
                for (size_t ih = 0; ih < src_h; ++ih) {
                    for (size_t iw = 0; iw < src_w; ++iw) {
                        const float scale = src_ptr[src_idx++];
                        for (size_t oc = 0; oc < ocpg; ++oc) {
                            const size_t oc_total = g * ocpg + oc;
                            for (size_t fh = 0; fh < flt_h; ++fh) {
                                for (size_t fw = 0; fw < flt_w; ++fw) {
                                    const int oh =
                                            static_cast<int>(
                                                    ih * stride_h + fh * dilate_h) -
                                            static_cast<int>(pad_h);
                                    const int ow =
                                            static_cast<int>(
                                                    iw * stride_w + fw * dilate_w) -
                                            static_cast<int>(pad_w);
                                    const bool inside =
                                            oh >= 0 && ow >= 0 &&
                                            oh < static_cast<int>(dst_h) &&
                                            ow < static_cast<int>(dst_w);
                                    if (!inside) {
                                        continue;
                                    }
                                    const size_t dst_off =
                                            n * dst_layout.stride[0] +
                                            oc_total * dst_layout.stride[1] +
                                            oh * dst_layout.stride[2] + ow;
                                    // the group axis only contributes for GROUP
                                    // filters
                                    const size_t grp_off =
                                            dense ? 0 : g * flt_layout.stride[0];
                                    const size_t flt_off =
                                            grp_off + ic * flt_layout.stride[ax] +
                                            oc * flt_layout.stride[ax + 1] +
                                            fh * flt_layout.stride[ax + 2] + fw;
                                    dst_ptr[dst_off] += scale * flt_ptr[flt_off];
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}
//! Brute-force float32 reference for ConvolutionBackwardFilter.
//!
//! \param inps  inps[0] is the source image, inps[1] the output gradient and
//!              inps[2] a tensor whose shape gives the filter shape; all are
//!              indexed as densely packed 4-dim float tensors.
//! \param out   receives a freshly allocated filter gradient on "xpu0".
//! \param param only stride_h/stride_w and pad_h/pad_w are read.
void conv_bwd_flt_brute(
        const std::vector<std::shared_ptr<HostTensorND>>& inps,
        std::shared_ptr<HostTensorND>& out,
        const opr::ConvolutionBackwardFilter::Param& param) {
    auto& src = *inps[0];
    auto& diff = *inps[1];
    auto& flt = *inps[2];

    const size_t batch = src.shape(0), src_h = src.shape(2), src_w = src.shape(3);
    const size_t out_chl = flt.shape(0), in_chl = flt.shape(1),
                 flt_h = flt.shape(2), flt_w = flt.shape(3);
    const size_t diff_h = diff.shape(2), diff_w = diff.shape(3);

    out = std::make_shared<HostTensorND>(
            CompNode::load("xpu0"), TensorShape{out_chl, in_chl, flt_h, flt_w});

    const float* src_ptr = src.ptr<float>();
    const float* diff_ptr = diff.ptr<float>();
    float* grad_ptr = out->ptr<float>();
    memset(grad_ptr, 0, sizeof(float) * out->shape().total_nr_elems());

    for (size_t n = 0; n < batch; ++n) {
        for (size_t oc = 0; oc < out_chl; ++oc) {
            for (size_t ic = 0; ic < in_chl; ++ic) {
                const size_t src_base = (n * in_chl + ic) * src_h * src_w;
                const size_t diff_base = (n * out_chl + oc) * diff_h * diff_w;
                const size_t grad_base = (oc * in_chl + ic) * flt_h * flt_w;
                for (size_t oh = 0; oh < diff_h; ++oh) {
                    for (size_t ow = 0; ow < diff_w; ++ow) {
                        const float diff_val = diff_ptr[diff_base + oh * diff_w + ow];
                        for (size_t fh = 0; fh < flt_h; ++fh) {
                            // unsigned arithmetic: a position inside the top/left
                            // padding wraps to a huge value and fails the bound
                            // check below
                            const size_t ih = oh * param.stride_h + fh - param.pad_h;
                            for (size_t fw = 0; fw < flt_w; ++fw) {
                                const size_t iw =
                                        ow * param.stride_w + fw - param.pad_w;
                                const bool inside = ih < src_h && iw < src_w;
                                const float src_val =
                                        inside ? src_ptr[src_base + ih * src_w + iw]
                                               : 0.f;
                                grad_ptr[grad_base + fh * flt_w + fw] +=
                                        diff_val * src_val;
                            }
                        }
                    }
                }
            }
        }
    }
}
//! Brute-force float32 reference for the LocalShare operator.
//!
//! \param inps  inps[0] is a 4-dim input; inps[1] is a 6-dim filter indexed as
//!              (spatial group row, spatial group col, in channel, kernel row,
//!              kernel col, out channel).
//! \param out   receives a freshly allocated result tensor on "xpu0".
//! \param param must use dilation 1 and CROSS_CORRELATION mode, and its
//!              spatial group counts must match the filter's first two axes.
void local_share_brute(
        const std::vector<std::shared_ptr<HostTensorND>>& inps,
        std::shared_ptr<HostTensorND>& out, const opr::LocalShare::Param& param) {
    auto src = inps[0], flt = inps[1];
    mgb_assert(src->shape().ndim == 4);
    mgb_assert(flt->shape().ndim == 6);

    const int batch = src->shape()[0], in_chl = src->shape()[1],
              in_h = src->shape()[2], in_w = src->shape()[3];
    const int flt_h = flt->shape()[3], flt_w = flt->shape()[4];
    const int pad_h = param.pad_h, pad_w = param.pad_w;
    const int stride_h = param.stride_h, stride_w = param.stride_w;
    const int dilate_h = param.dilate_h, dilate_w = param.dilate_w;
    const int grp_rows = flt->shape()[0], grp_cols = flt->shape()[1];

    mgb_assert(dilate_h == 1 && dilate_w == 1);
    mgb_assert(
            static_cast<uint32_t>(grp_rows) == param.spatial_groups_h &&
            static_cast<uint32_t>(grp_cols) == param.spatial_groups_w);

    const int out_h = (in_h + 2 * pad_h - flt_h) / stride_h + 1;
    const int out_w = (in_w + 2 * pad_w - flt_w) / stride_w + 1;
    mgb_assert(out_h % grp_rows == 0 && out_w % grp_cols == 0);
    // number of output rows/columns that share one filter
    const int rows_per_grp = out_h / grp_rows, cols_per_grp = out_w / grp_cols;
    const int out_chl = flt->shape()[5];

    auto idx = [](int v) { return static_cast<size_t>(v); };

    out = std::make_shared<HostTensorND>(
            CompNode::load("xpu0"),
            TensorShape{idx(batch), idx(out_chl), idx(out_h), idx(out_w)});
    mgb_assert(param.mode == Param::Mode::CROSS_CORRELATION);

    for (int n = 0; n < batch; ++n) {
        for (int oc = 0; oc < out_chl; ++oc) {
            for (int oh = 0; oh < out_h; ++oh) {
                for (int ow = 0; ow < out_w; ++ow) {
                    float& acc = out->ptr<float>(
                            {idx(n), idx(oc), idx(oh), idx(ow)})[0];
                    acc = 0;
                    // pick the filter belonging to this output position's group
                    const int grp_r = oh / rows_per_grp;
                    const int grp_c = ow / cols_per_grp;
                    for (int ic = 0; ic < in_chl; ++ic) {
                        for (int kh = 0; kh < flt_h; ++kh) {
                            for (int kw = 0; kw < flt_w; ++kw) {
                                const int ih = oh * stride_h - pad_h + kh;
                                const int iw = ow * stride_w - pad_w + kw;
                                const bool inside =
                                        ih >= 0 && ih < in_h && iw >= 0 && iw < in_w;
                                // padded positions contribute zero
                                float src_val = 0.f;
                                if (inside) {
                                    src_val = src->ptr<float>(
                                            {idx(n), idx(ic), idx(ih), idx(iw)})[0];
                                }
                                const float flt_val = flt->ptr<float>(
                                        {idx(grp_r), idx(grp_c), idx(ic), idx(kh),
                                         idx(kw), idx(oc)})[0];
                                acc += flt_val * src_val;
                            }
                        }
                    }
                }
            }
        }
    }
}
//! Brute-force float32 reference for the dense 2D convolution operator.
//!
//! \param in_tensor  exactly two 4-dim tensors: the image, indexed as
//!                   (batch, channel, row, col), and the filter, indexed as
//!                   (out channel, in channel, row, col).
//! \param out_tensor receives a freshly allocated result tensor on "xpu0".
//! \param param      pad/stride/dilate and mode (CONVOLUTION flips the filter,
//!                   CROSS_CORRELATION does not).
void convolution_brute(
        const std::vector<std::shared_ptr<HostTensorND>>& in_tensor,
        std::shared_ptr<HostTensorND>& out_tensor,
        const opr::Convolution::Param& param) {
    mgb_assert(in_tensor.size() == 2);
    auto src = in_tensor[0], flt = in_tensor[1];
    mgb_assert(src->shape().ndim == 4);
    mgb_assert(flt->shape().ndim == 4);

    const int batch = src->shape().shape[0], in_chl = src->shape().shape[1],
              in_h = src->shape().shape[2], in_w = src->shape().shape[3];
    const int flt_h = flt->shape().shape[2], flt_w = flt->shape().shape[3];
    const int pad_h = param.pad_h, pad_w = param.pad_w;
    const int stride_h = param.stride_h, stride_w = param.stride_w;
    const int dilate_h = param.dilate_h, dilate_w = param.dilate_w;

    // extent covered by the dilated filter along each axis
    const int ext_h = (flt_h - 1) * dilate_h + 1;
    const int ext_w = (flt_w - 1) * dilate_w + 1;
    mgb_assert(in_h + 2 * pad_h >= ext_h);
    mgb_assert(in_w + 2 * pad_w >= ext_w);
    const int out_h = (in_h + 2 * pad_h - ext_h) / stride_h + 1;
    const int out_w = (in_w + 2 * pad_w - ext_w) / stride_w + 1;

    mgb_assert(static_cast<size_t>(in_chl) == flt->shape().shape[1]);
    const int out_chl = flt->shape().shape[0];

    auto idx = [](int v) { return static_cast<size_t>(v); };

    out_tensor = std::make_shared<HostTensorND>(
            CompNode::load("xpu0"),
            TensorShape{idx(batch), idx(out_chl), idx(out_h), idx(out_w)});

    for (int n = 0; n < batch; ++n) {
        for (int oc = 0; oc < out_chl; ++oc) {
            for (int oh = 0; oh < out_h; ++oh) {
                // top-left corner of the receptive field in input coordinates
                const int top = oh * stride_h - pad_h;
                for (int ow = 0; ow < out_w; ++ow) {
                    const int left = ow * stride_w - pad_w;
                    float& acc = out_tensor->ptr<float>(
                            {idx(n), idx(oc), idx(oh), idx(ow)})[0];
                    acc = 0;
                    for (int ic = 0; ic < in_chl; ++ic) {
                        for (int kh = 0; kh < flt_h; ++kh) {
                            for (int kw = 0; kw < flt_w; ++kw) {
                                int src_y, src_x;
                                if (param.mode == Param::Mode::CONVOLUTION) {
                                    // true convolution walks the filter mirrored
                                    src_y = top + (flt_h - kh - 1) * dilate_h;
                                    src_x = left + (flt_w - kw - 1) * dilate_w;
                                } else {
                                    mgb_assert(
                                            param.mode ==
                                            Param::Mode::CROSS_CORRELATION);
                                    src_y = top + kh * dilate_h;
                                    src_x = left + kw * dilate_w;
                                }
                                const bool inside = src_y >= 0 && src_y < in_h &&
                                                    src_x >= 0 && src_x < in_w;
                                // padded positions read as zero
                                float img_val = 0;
                                if (inside) {
                                    img_val = src->ptr<float>(
                                            {idx(n), idx(ic), idx(src_y),
                                             idx(src_x)})[0];
                                }
                                const float flt_val = flt->ptr<float>(
                                        {idx(oc), idx(ic), idx(kh), idx(kw)})[0];
                                acc += img_val * flt_val;
                            }
                        }
                    }
                }
            }
        }
    }
}
//! Builds a Convolution param from the fields that a ConvBiasForward param
//! shares with it: mode, pad, stride, dilate, sparse and format.
opr::Convolution::Param convert_to_conv_param(
        const opr::ConvBiasForward::Param& param) {
    using ConvParam = opr::Convolution::Param;
    ConvParam conv_param{
            param.mode,     param.pad_h,    param.pad_w,
            param.stride_h, param.stride_w, param.dilate_h,
            param.dilate_w, param.sparse,   param.format};
    return conv_param;
}
#if MGB_CUDA
//! Builds a Convolution param from the fields that a BatchConvBiasForward
//! param shares with it: mode, pad, stride, dilate, sparse and format.
opr::Convolution::Param convert_to_conv_param(
        const opr::BatchConvBiasForward::Param& param) {
    using ConvParam = opr::Convolution::Param;
    ConvParam conv_param{
            param.mode,     param.pad_h,    param.pad_w,
            param.stride_h, param.stride_w, param.dilate_h,
            param.dilate_w, param.sparse,   param.format};
    return conv_param;
}
#endif
//! Compares opr::Convolution forward output with convolution_brute for every
//! mode in modes_to_check, using differing height/width settings.
TEST(TestOprDNN, ConvolutionForward) {
    const uint32_t img_h = 10, in_chl = 16, out_chl = 32, pad_h = 0, stride_h = 1,
                   flt_h = 2;
    // each width-side value is one larger than its height-side counterpart
    const uint32_t img_w = img_h + 1, flt_w = flt_h + 1, pad_w = pad_h + 1,
                   stride_w = stride_h + 1;
    const size_t batch_size = 32;
    for (auto mode : modes_to_check) {
        Param param{mode, pad_h, pad_w, stride_h, stride_w};
        opr::test::ForwardChecker<opr::Convolution, 2> checker(
                {{batch_size, in_chl, img_h, img_w},
                 {out_chl, in_chl, flt_h, flt_w}},
                convolution_brute, param);
        checker.run();
    }
}
//! Runs the BackwardChecker on opr::Convolution for every mode in
//! modes_to_check.
TEST(TestOprDNN, ConvolutionBackward) {
    const uint32_t img_h = 10, in_chl = 16, out_chl = 32, pad_h = 0, stride_h = 1,
                   flt_h = 2;
    const uint32_t img_w = 11, flt_w = 4, pad_w = 1, stride_w = 3;
    const size_t batch_size = 32;
    for (auto mode : modes_to_check) {
        Param param{mode, pad_h, pad_w, stride_h, stride_w};
        opr::test::BackwardChecker<opr::Convolution, 2> checker(
                {{batch_size, in_chl, img_h, img_w},
                 {out_chl, in_chl, flt_h, flt_w}},
                param, 1e-2, 1);
        checker.run();
    }
}
//! Executes a quantized NHWC ConvBias on "cpux" under several execution
//! strategies and checks that the OPTIMIZED | PROFILE combination is rejected
//! with MegBrainError.
TEST(TestOprDNN, ConvBiasExePolicy) {
    using Param = opr::ConvBias::Param;
    Param param;
    using Policy = opr::ConvBias::ExecutionPolicy;
    using S = Policy::Strategy;

    auto cn = CompNode::load("cpux");

    // swap in an in-memory cache for the duration of the test; the previous
    // implementation is put back at the end
    auto orig_impl =
            PersistentCache::set_impl(std::make_shared<InMemoryPersistentCache>());

    auto run = [&](S strategy) {
        auto graph = ComputingGraph::make();
        HostTensorGenerator<> gen;
        auto mkvar = [&](const char* name, const TensorShape& shp,
                         const DType& dtype) {
            return opr::TypeCvt::make(
                    opr::Host2DeviceCopy::make(*graph, gen(shp), cn).rename(name),
                    dtype);
        };
        auto x = mkvar("x", {20, 50, 50, 16}, dtype::QuantizedS8(2.5f));
        auto w = mkvar("w", {24, 3, 3, 16}, dtype::QuantizedS8(2.5f));
        auto bias = mkvar("bias", {1, 1, 1, 24}, dtype::QuantizedS32(6.25f));

        param.nonlineMode = Param::NonlineMode::RELU;
        param.format = Param::Format::NHWC;

        Policy policy;
        policy.strategy = strategy;

        auto conv_bias = opr::ConvBias::make(
                x, w, bias, param, policy,
                OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
        HostTensorND host_y;
        auto func = graph->compile({make_callback_copy(conv_bias, host_y)});
        func->execute();
        // start the next run from an empty persistent cache
        PersistentCache::set_impl(std::make_shared<InMemoryPersistentCache>());
    };

    // The strategy list is built outside the loop header so that only complete
    // statements are selected by the preprocessor. The non-fastrun list used to
    // read `S : HEURISTIC`, which does not compile.
#if MGB_ENABLE_FASTRUN
    SmallVector<S> strategies{
            S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
            S::PROFILE | S::HEURISTIC};
#else
    SmallVector<S> strategies{S::HEURISTIC, S::PROFILE | S::HEURISTIC};
#endif
    for (auto strategy : strategies) {
        run(strategy);
    }
    megdnn::HeuristicCache::instance().clear();
    ASSERT_THROW(run(S::OPTIMIZED | S::PROFILE), MegBrainError);
    PersistentCache::set_impl(orig_impl);
}
//! Executes an NHWC ConvBias with Quantized8Asymm inputs on "cpux" under the
//! PROFILE and PROFILE | REPRODUCIBLE strategies; only successful execution is
//! checked.
TEST(TestOprDNN, ConvBiasExePolicy_Quantized8Asym) {
    using Param = opr::ConvBias::Param;
    using Policy = opr::ConvBias::ExecutionPolicy;
    using S = Policy::Strategy;

    Param param;
    auto cn = CompNode::load("cpux");

    SmallVector<S> strategies{S::PROFILE, S::PROFILE | S::REPRODUCIBLE};
    for (auto strategy : strategies) {
        auto graph = ComputingGraph::make();
        HostTensorGenerator<> gen;
        // host tensor -> device copy (renamed) -> conversion to the given dtype
        auto make_var = [&](const char* name, const TensorShape& shp,
                            const DType& dtype) {
            auto h2d = opr::Host2DeviceCopy::make(*graph, gen(shp), cn).rename(name);
            return opr::TypeCvt::make(h2d, dtype);
        };

        auto x = make_var(
                "x", {20, 50, 50, 16},
                dtype::Quantized8Asymm(2.5f, static_cast<uint8_t>(0)));
        auto w = make_var(
                "w", {24, 3, 3, 16},
                dtype::Quantized8Asymm(2.5f, static_cast<uint8_t>(0)));
        auto bias = make_var("bias", {1, 1, 1, 24}, dtype::QuantizedS32(6.25f));

        param.nonlineMode = Param::NonlineMode::RELU;
        param.format = Param::Format::NHWC;

        Policy policy;
        policy.strategy = strategy;

        OperatorNodeConfig out_config{
                dtype::Quantized8Asymm(2.5f, static_cast<uint8_t>(0))};
        auto conv_bias = opr::ConvBias::make(x, w, bias, param, policy, out_config);

        HostTensorND host_y;
        auto func = graph->compile({make_callback_copy(conv_bias, host_y)});
        func->execute();
    }
}
//! Checks opr::Convolution against convolution_brute under several execution
//! strategies, and uses a PersistentCacheHook to verify that the persistent
//! cache is queried for every strategy except plain HEURISTIC.
TEST(TestOprDNN, ConvolutionExePolicy) {
    Param param{Mode::CONVOLUTION};
    using Policy = opr::Convolution::ExecutionPolicy;
    using S = Policy::Strategy;

    // counts how many times the hook's callback fires
    int nr_get = 0;
    auto on_get = [&nr_get](
                          const std::string&, const void*, size_t, const void*,
                          size_t) { ++nr_get; };
    PersistentCacheHook cache_hook{on_get};

    // The strategy list is built outside the loop header so that only complete
    // statements are selected by the preprocessor. The non-fastrun list used to
    // read `S : HEURISTIC`, which does not compile.
#if MGB_ENABLE_FASTRUN
    SmallVector<S> strategies{
            S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
            S::PROFILE | S::HEURISTIC};
#else
    SmallVector<S> strategies{S::HEURISTIC, S::PROFILE | S::HEURISTIC};
#endif
    for (auto strategy : strategies) {
        megdnn::HeuristicCache::instance().clear();
        using Checker = AutoOprChecker<2, 1>;

        auto make_graph =
                [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
            Policy policy;
            policy.strategy = strategy;
            auto out = opr::Convolution::make(inputs[0], inputs[1], param, policy);
            return {out};
        };
        auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
            std::shared_ptr<HostTensorND> sh_out;
            convolution_brute({inp.begin(), inp.end()}, sh_out, param);
            dest[0] = *sh_out;
        };

        Checker::RunOptions opt;
        opt.numdiff_eps = 1;
        nr_get = 0;
        Checker(make_graph, fwd)
                .run({TensorShape{3, 2, 10, 6}, {4, 2, 3, 2}}, opt)
                .run({TensorShape{6, 3, 8, 13}, {2, 3, 2, 13}}, opt)
                .run({TensorShape{1, 1, 10, 10}, {2, 1, 3, 3}}, opt);

        if (strategy == S::HEURISTIC) {
            ASSERT_EQ(0, nr_get);
        } else {
            ASSERT_LT(0, nr_get);
        }
        megdnn::HeuristicCache::instance().clear();
    }
}
//! Checks bfloat16 deconvolution (ConvolutionBackwardData::make_deconv) with
//! FLOAT32 compute mode against conv_bwd_data_brute under several execution
//! strategies. Requires a GPU. A PersistentCacheHook verifies that the
//! persistent cache is queried for every strategy except plain HEURISTIC.
TEST(TestOprDNN, ConvolutionBackwardDataBfloat16ExePolicy) {
    REQUIRE_GPU(1);
    Param param{Mode::CROSS_CORRELATION, 1, 1, 1, 1};
    param.compute_mode = Param::ComputeMode::FLOAT32;
    using Policy = opr::Convolution::ExecutionPolicy;
    using S = Policy::Strategy;

    // fills a bfloat16 tensor with values drawn uniformly from [-1, 1)
    auto gen_bfp16 = [](HostTensorND& dest) {
        RNGxorshf rng{next_rand_seed()};
        auto rand_real = [&rng]() {
            std::uniform_real_distribution<float> dist(-1, 1);
            return dist(rng);
        };
        auto ptr = dest.ptr<dt_bfloat16>();
        size_t elems = dest.shape().total_nr_elems();
        for (size_t i = 0; i < elems; i++) {
            ptr[i] = dt_bfloat16(rand_real());
        }
    };

    // element-wise conversions between float32 and bfloat16 host tensors
    auto f32_to_bf16 = [](const std::shared_ptr<HostTensorND>& src)
            -> std::shared_ptr<HostTensorND> {
        auto ret = std::make_shared<HostTensorND>(
                src->comp_node(), src->shape(), dtype::BFloat16{});
        for (size_t i = 0; i < src->layout().total_nr_elems(); i++) {
            ret->ptr<dt_bfloat16>()[i] = src->ptr<dt_float32>()[i];
        }
        return ret;
    };
    auto bf16_to_f32 = [](const std::shared_ptr<HostTensorND>& src)
            -> std::shared_ptr<HostTensorND> {
        auto ret = std::make_shared<HostTensorND>(
                src->comp_node(), src->shape(), dtype::Float32{});
        for (size_t i = 0; i < src->layout().total_nr_elems(); i++) {
            ret->ptr<dt_float32>()[i] = src->ptr<dt_bfloat16>()[i];
        }
        return ret;
    };

    // counts how many times the hook's callback fires
    int nr_get = 0;
    auto on_get = [&nr_get](
                          const std::string&, const void*, size_t, const void*,
                          size_t) { ++nr_get; };
    PersistentCacheHook cache_hook{on_get};

    // The strategy list is built outside the loop header so that only complete
    // statements are selected by the preprocessor. The non-fastrun list used to
    // read `S : HEURISTIC`, which does not compile.
#if MGB_ENABLE_FASTRUN
    const S strategies[] = {
            S::PROFILE, S::HEURISTIC, S(S::PROFILE | S::REPRODUCIBLE),
            S(S::PROFILE | S::HEURISTIC)};
#else
    const S strategies[] = {S::HEURISTIC, S(S::PROFILE | S::HEURISTIC)};
#endif
    for (auto strategy : strategies) {
        megdnn::HeuristicCache::instance().clear();
        using Checker = AutoOprChecker<2, 1>;

        auto make_graph =
                [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
            Policy policy;
            policy.strategy = strategy;
            return {opr::ConvolutionBackwardData::make_deconv(
                    inputs[0], inputs[1], param, policy)};
        };
        // the reference runs in float32 and is converted back to bfloat16
        auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
            std::shared_ptr<HostTensorND> out;
            conv_bwd_data_brute(
                    {bf16_to_f32(inp[0]), bf16_to_f32(inp[1])}, out, param);
            dest[0] = *f32_to_bf16(out);
        };

        Checker::RunOptions opt;
        opt.outputs_max_err = 1e-3;
        nr_get = 0;
        Checker(make_graph, fwd)
                .disable_grad_check()
                .set_input_dtype(0, dtype::BFloat16{})
                .set_input_dtype(1, dtype::BFloat16{})
                .set_input_generator(0, gen_bfp16)
                .set_input_generator(1, gen_bfp16)
                .run({TensorShape{3, 4, 10, 6}, {4, 2, 3, 3}}, opt)
                .run({TensorShape{2, 2, 4, 3}, {2, 2, 3, 3}}, opt)
                .run({TensorShape{1, 3, 10, 6}, {3, 2, 3, 3}}, opt);

        if (strategy == S::HEURISTIC) {
            ASSERT_EQ(0, nr_get);
        } else {
            ASSERT_LT(0, nr_get);
        }
    }
}
#if MGB_ENABLE_FASTRUN
//! Checks float16 deconvolution (ConvolutionBackwardData::make_deconv) with
//! FLOAT32 compute mode against conv_bwd_data_brute under the
//! PROFILE | REPRODUCIBLE strategy. Requires a GPU and is only built when
//! fastrun is enabled.
TEST(TestOprDNN, ConvolutionBackwardDataFloat16ExePolicy) {
    REQUIRE_GPU(1);
    Param param{Mode::CROSS_CORRELATION, 1, 1, 1, 1};
    param.compute_mode = Param::ComputeMode::FLOAT32;
    using Policy = opr::Convolution::ExecutionPolicy;
    using S = Policy::Strategy;
    using TensorPtr = std::shared_ptr<HostTensorND>;

    // fills a float16 tensor with values drawn uniformly from [-1, 1)
    auto gen_fp16 = [](HostTensorND& dest) {
        RNGxorshf rng{next_rand_seed()};
        auto next_value = [&rng]() {
            std::uniform_real_distribution<float> dist(-1, 1);
            return dist(rng);
        };
        auto data = dest.ptr<dt_float16>();
        const size_t nr_elems = dest.shape().total_nr_elems();
        for (size_t idx = 0; idx < nr_elems; idx++) {
            data[idx] = dt_float16(next_value());
        }
    };

    // element-wise conversions between float32 and float16 host tensors
    auto f32_to_f16 = [](const TensorPtr& src) -> TensorPtr {
        auto converted = std::make_shared<HostTensorND>(
                src->comp_node(), src->shape(), dtype::Float16{});
        for (size_t idx = 0; idx < src->layout().total_nr_elems(); idx++) {
            converted->ptr<dt_float16>()[idx] = src->ptr<dt_float32>()[idx];
        }
        return converted;
    };
    auto f16_to_f32 = [](const TensorPtr& src) -> TensorPtr {
        auto converted = std::make_shared<HostTensorND>(
                src->comp_node(), src->shape(), dtype::Float32{});
        for (size_t idx = 0; idx < src->layout().total_nr_elems(); idx++) {
            converted->ptr<dt_float32>()[idx] = src->ptr<dt_float16>()[idx];
        }
        return converted;
    };

    // counts how many times the hook's callback fires
    int nr_get = 0;
    auto on_get = [&nr_get](
                          const std::string&, const void*, size_t, const void*,
                          size_t) { ++nr_get; };
    PersistentCacheHook cache_hook{on_get};

    auto strategy = S(S::PROFILE | S::REPRODUCIBLE);
    using Checker = AutoOprChecker<2, 1>;

    auto make_graph =
            [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
        Policy policy;
        policy.strategy = strategy;
        return {opr::ConvolutionBackwardData::make_deconv(
                inputs[0], inputs[1], param, policy)};
    };
    // the reference runs in float32 and is converted back to float16
    auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
        TensorPtr ref_out;
        conv_bwd_data_brute(
                {f16_to_f32(inp[0]), f16_to_f32(inp[1])}, ref_out, param);
        dest[0] = *f32_to_f16(ref_out);
    };

    Checker::RunOptions opt;
    opt.outputs_max_err = 1e-2;
    nr_get = 0;
    Checker(make_graph, fwd)
            .disable_grad_check()
            .set_input_dtype(0, dtype::Float16{})
            .set_input_dtype(1, dtype::Float16{})
            .set_input_generator(0, gen_fp16)
            .set_input_generator(1, gen_fp16)
            .run({TensorShape{3, 4, 10, 6}, {4, 2, 3, 3}}, opt)
            .run({TensorShape{2, 2, 4, 3}, {2, 2, 3, 3}}, opt)
            .run({TensorShape{1, 3, 10, 6}, {3, 2, 3, 3}}, opt);

    if (strategy == S::HEURISTIC) {
        ASSERT_EQ(0, nr_get);
    } else {
        ASSERT_LT(0, nr_get);
    }
}
#endif
//! Checks grouped deconvolution (ConvolutionBackwardData::make_deconv) with
//! dilate_h = 2 against conv_bwd_data_brute on three shape pairs.
TEST(TestOprDNN, Deconvolution) {
    using Checker = AutoOprChecker<2, 1>;

    Param param{Mode::CROSS_CORRELATION, 0, 1, 1, 2};
    param.dilate_h = 2;
    param.sparse = Param::Sparse::GROUP;

    auto build_deconv =
            [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
        auto deconv =
                opr::ConvolutionBackwardData::make_deconv(inputs[0], inputs[1], param);
        return {deconv};
    };
    auto reference = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
        std::shared_ptr<HostTensorND> ref_out;
        conv_bwd_data_brute({inp[0], inp[1]}, ref_out, param);
        dest[0] = *ref_out;
    };

    Checker::RunOptions run_opt;
    run_opt.numdiff_eps = 1;

    // each pair is {data shape, 5-dim grouped filter shape}
    Checker(build_deconv, reference)
            .run({TensorShape{2, 4, 6, 8}, {1, 4, 5, 3, 2}}, run_opt)
            .run({TensorShape{3, 2, 1, 1}, {2, 1, 1, 4, 3}}, run_opt)
            .run({TensorShape{4, 6, 7, 2}, {2, 3, 4, 8, 13}}, run_opt);
}
//! Executes an NCHW4 QuantizedS8 deconvolution under several execution
//! strategies; only successful execution is checked. Requires a GPU with CUDA
//! compute capability 6.1.
TEST(TestOprDNN, DeconvolutionExePolicy_QuantizedS8) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1);

    Param param;
    using Policy = opr::ConvolutionBackwardData::ExecutionPolicy;
    using S = Policy::Strategy;

    // The strategy list is built outside the loop header so that only complete
    // statements are selected by the preprocessor. The non-fastrun list used to
    // read `S : HEURISTIC`, which does not compile.
#if MGB_ENABLE_FASTRUN
    const S strategies[] = {
            S::PROFILE, S::HEURISTIC, S(S::PROFILE | S::REPRODUCIBLE),
            S(S::PROFILE | S::HEURISTIC)};
#else
    const S strategies[] = {S::HEURISTIC, S(S::PROFILE | S::HEURISTIC)};
#endif
    for (auto strategy : strategies) {
        auto graph = ComputingGraph::make();
        HostTensorGenerator<> gen;
        // host tensor -> device copy (renamed) -> conversion to the given dtype
        auto mkvar = [&](const char* name, const TensorShape& shp,
                         const DType& dtype) {
            return opr::TypeCvt::make(
                    opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name), dtype);
        };
        auto x = mkvar("x", {16, 4, 50, 50, 4}, dtype::QuantizedS8(1.2f));
        auto w = mkvar("w", {16, 4, 4, 4, 4}, dtype::QuantizedS8(1.3f));

        param.format = Param::Format::NCHW4;
        param.pad_h = param.pad_w = 2;
        param.stride_h = param.stride_w = 2;

        Policy policy;
        policy.strategy = strategy;

        auto deconv = opr::ConvolutionBackwardData::make_deconv(
                x, w, param, policy, OperatorNodeConfig{dtype::QuantizedS8(1.2f)});
        HostTensorND host_y;
        auto func = graph->compile({make_callback_copy(deconv, host_y)});
        func->execute();
    }
}
//! Checks opr::ConvolutionBackwardFilter against conv_bwd_flt_brute on three
//! shape triples built by the local inp_tensor macro.
TEST(TestOprDNN, ConvolutionBackwardFilter) {
using Checker = AutoOprChecker<3, 1>;
// padding and stride shared by the graph, the reference and the shape macros
constexpr size_t PH = 0, PW = 1, SH = 1, SW = 2;
auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
Param param{Mode::CROSS_CORRELATION, PH, PW, SH, SW};
return {opr::ConvolutionBackwardFilter::make(
inputs[0], inputs[1], inputs[2], param)};
};
// reference result computed by the brute-force implementation
auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
std::shared_ptr<HostTensorND> out;
conv_bwd_flt_brute(
{inp[0], inp[1], inp[2]}, out,
Param{Mode::CROSS_CORRELATION, PH, PW, SH, SW});
dest[0] = *out;
};
// get_shp: output extent of an undilated convolution along one axis.
// inp_tensor: expands to the three input shapes {src, diff, filter}, with
// the diff spatial extents derived from PH/PW/SH/SW through get_shp.
#define get_shp(N, P, S, F) ((N + 2 * P - F) / S + 1)
#define inp_tensor(N, IC, OC, IH, IW, FH, FW) \
{ \
TensorShape{N, IC, IH, IW}, \
{N, OC, get_shp(IH, PH, SH, FH), get_shp(IW, PW, SW, FW)}, { \
OC, IC, FH, FW \
} \
}
Checker::RunOptions opt;
opt.numdiff_eps = 1;
Checker(make_graph, fwd)
.run(inp_tensor(2, 3, 4, 9, 8, 4, 3), opt)
.run(inp_tensor(1, 5, 3, 7, 9, 3, 4), opt)
.run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt);
// keep the helper macros local to this test
#undef inp_tensor
#undef get_shp
}
//! Checks opr::Convolution with dilate_h = 2 against the megdnn naive-handle
//! Convolution operator on three shape pairs.
TEST(TestOprDNN, DilatedConvolution) {
    using Checker = AutoOprChecker<2, 1>;

    opr::ConvolutionForward::Param param;
    param.pad_h = 5;
    param.pad_w = 2;
    param.stride_w = 2;
    param.dilate_h = 2;

    auto build_conv =
            [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
        auto conv = opr::Convolution::make(inputs[0], inputs[1], param);
        return {conv};
    };

    // reference: run the same param through the naive megdnn operator
    auto reference = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
        auto naive_conv =
                megdnn_naive_handle()->create_operator<megdnn::Convolution>();
        naive_conv->param() = param;

        TensorLayout out_layout;
        naive_conv->deduce_layout(inp[0]->layout(), inp[1]->layout(), out_layout);

        std::vector<dt_byte> workspace(naive_conv->get_workspace_in_bytes(
                inp[0]->layout(), inp[1]->layout(), out_layout, nullptr));

        auto& out = dest[0];
        out.dtype(dtype::Float32()).comp_node(inp[0]->comp_node()).resize(out_layout);

        naive_conv->exec(
                inp[0]->as_megdnn(), inp[1]->as_megdnn(), out.as_megdnn(), nullptr,
                {workspace.data(), workspace.size()});
    };

    Checker::RunOptions run_opt;
    run_opt.numdiff_eps = 0.1;

    Checker(build_conv, reference)
            .run({TensorShape{2, 3, 8, 7}, TensorShape{4, 3, 2, 2}}, run_opt)
            .run({TensorShape{2, 3, 8, 7}, TensorShape{4, 3, 3, 2}}, run_opt)
            .run({TensorShape{2, 3, 8, 9}, TensorShape{4, 3, 3, 2}}, run_opt);
}
//! Checks GROUP-sparse opr::Convolution against a reference that runs the
//! non-grouped convolution once per group and stitches the results together.
TEST(TestOprDNN, GroupConv) {
using Checker = AutoOprChecker<2, 1>;
opr::Convolution::Param param;
param.pad_h = 1;
param.pad_w = 2;
param.stride_h = 2;
// graph under test: same param, but with sparse switched to GROUP
auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
auto p1 = param;
p1.sparse = opr::Convolution::Param::Sparse::GROUP;
return {opr::Convolution::make(inputs[0], inputs[1], p1)};
};
auto cn = CompNode::load("xpux");
// host tensors feeding the per-group reference graph; refilled for each group
auto inp0 = std::make_shared<HostTensorND>(cn, dtype::Float32()),
inp1 = std::make_shared<HostTensorND>(cn, dtype::Float32());
HostTensorND out_raw;
// reference graph: convolution with the unmodified param, compiled once
auto graph_raw = ComputingGraph::make();
auto func_raw = graph_raw->compile({make_callback_copy(
opr::Convolution::make(
opr::Host2DeviceCopy::make(*graph_raw, inp0),
opr::Host2DeviceCopy::make(*graph_raw, inp1), param),
out_raw)});
auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
auto&& out = dest[0];
// sl: input layout with the channel extent reduced to one group's share;
// fl: filter layout with the leading group axis removed
auto sl = inp[0]->layout(), fl = inp[1]->layout().remove_axis(0);
TensorLayout ol;
auto group = inp[1]->layout()[0];
sl.shape[1] /= group;
for (size_t i = 0; i < group; ++i) {
// feed the i-th channel slice of the input and the i-th filter group
inp0->copy_from(inp[0]->sub(SubTensorSpec::make_from_offset_elem(
sl, i * sl[1] * sl[2] * sl[3])));
inp1->copy_from(inp[1]->sub(
SubTensorSpec::make_from_offset_elem(fl, i * fl.total_nr_elems())));
func_raw->execute();
if (!i) {
// the first result fixes the full output shape (channels * group);
// ol then describes a single group's slice of that output
auto oshp = out_raw.shape();
oshp[1] *= group;
out.resize(oshp);
ol = out.layout();
ol[1] /= group;
}
// write this group's result into its channel slice of the output
out.sub(SubTensorSpec::make_from_offset_elem(ol, i * ol[1] * ol[2] * ol[3]))
.copy_from_fixlayout(out_raw);
}
};
Checker::RunOptions opt;
opt.numdiff_eps = 1;
opt.outputs_max_err = 5e-5;
Checker checker{make_graph, fwd};
// builds the 5-dim grouped filter shape from (fh, fw, oc, group) and runs
auto run = [&](const TensorShape& ishp, size_t fh, size_t fw, size_t oc,
size_t group) {
size_t ic = ishp[1];
TensorShape flt{group, oc / group, ic / group, fh, fw};
checker.run({ishp, flt}, opt);
};
run({1, 2, 1, 1}, 1, 1, 2, 2);
run({3, 9, 5, 4}, 1, 2, 6, 3);
run({3, 6, 8, 9}, 3, 1, 4, 2);
run({2, 5, 3, 6}, 2, 3, 5, 1);
run({2, 6, 3, 6}, 2, 3, 6, 6);
}
//! Checks opr::MaskConvolution against convolution_brute followed by zeroing
//! every output position whose int8 mask entry is 0.
TEST(TestOprDNN, MaskConvolution) {
    using Checker = AutoOprChecker<3, 1>;
    opr::Convolution::Param param;

    auto build_graph =
            [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
        auto masked =
                opr::MaskConvolution::make(inputs[0], inputs[1], inputs[2], param);
        return {masked};
    };

    auto reference = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
        std::shared_ptr<HostTensorND> conv_out;
        convolution_brute({inp[0], inp[1]}, conv_out, param);
        dest[0] = *conv_out;

        const size_t batch = dest[0].shape()[0];
        const size_t out_chl = dest[0].shape()[1];
        const size_t out_h = dest[0].shape()[2];
        const size_t out_w = dest[0].shape()[3];
        const size_t nr_plane = batch * out_chl;
        const size_t plane_size = out_h * out_w;

        auto mask = inp[2]->ptr<int8_t>();
        auto result = dest[0].ptr<float>();
        // the same 2D mask is applied to every (batch, channel) plane
        for (size_t plane = 0; plane < nr_plane; ++plane) {
            for (size_t pos = 0; pos < plane_size; ++pos) {
                if (mask[pos] == 0) {
                    result[plane * out_h * out_w + pos] = 0;
                }
            }
        }
    };

    // replaces the mask tensor with an int8 tensor generated over [0, 1]
    auto gen_mask = [](HostTensorND& dest) {
        HostTensorGenerator<dtype::Int8, RandomDistribution::UNIFORM> generator{
                0, 1};
        dest = *generator(dest.shape(), dest.comp_node());
    };

    auto run_with_param = [&](size_t SH = 1, size_t SW = 1, size_t PH = 0,
                              size_t PW = 0) {
        param.pad_h = PH;
        param.pad_w = PW;
        param.stride_h = SH;
        param.stride_w = SW;

        Checker checker{build_graph, reference};
        Checker::RunOptions run_opt;
        checker.set_output_allow_grad(0, false);
        checker.set_input_dtype(2, dtype::Int8());
        checker.set_input_generator(2, gen_mask);

        // the mask shape equals the spatial shape of the convolution output
        auto run_case = [&](size_t N, size_t IC, size_t OC, size_t IH, size_t IW,
                            size_t FH, size_t FW) {
            const size_t out_h = (IH + 2 * PH - FH) / SH + 1;
            const size_t out_w = (IW + 2 * PW - FW) / SW + 1;
            checker.run(
                    {TensorShape{N, IC, IH, IW}, {OC, IC, FH, FW}, {out_h, out_w}},
                    run_opt);
        };
        run_case(1, 1, 1, 5, 5, 3, 3);
        run_case(2, 3, 4, 5, 5, 3, 3);
        run_case(3, 3, 4, 224, 223, 3, 3);
        run_case(3, 3, 4, 224, 223, 2, 2);
    };

    run_with_param();
    run_with_param(2, 2, 3, 3);
    run_with_param(3, 2, 1, 2);
    run_with_param(2, 3, 2, 2);
}
//! Checks MaskPropagate feeding MaskConvolution against a reference that
//! zeroes the masked-out source positions and then runs convolution_brute.
TEST(TestOprDNN, MaskPropagate) {
    using Checker = AutoOprChecker<3, 1>;
    opr::MaskPropagate::Param mask_param;
    opr::Convolution::Param conv_param;

    auto build_graph =
            [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
        auto propagated = opr::MaskPropagate::make(inputs[2], mask_param);
        auto masked_conv = opr::MaskConvolution::make(
                inputs[0], inputs[1], propagated, conv_param);
        return {masked_conv};
    };

    auto reference = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
        auto& image = *inp[0];
        auto& mask = *inp[2];
        auto image_ptr = inp[0]->ptr<float>();
        auto mask_ptr = inp[2]->ptr<int>();
        mgb_assert(
                image.shape()[2] == mask.shape()[0] &&
                image.shape()[3] == mask.shape()[1]);

        // note: the source tensor is modified in place before the convolution
        const size_t nr_plane = image.shape()[0] * image.shape()[1];
        const size_t plane_size = image.shape()[2] * image.shape()[3];
        for (size_t plane = 0; plane < nr_plane; ++plane) {
            for (size_t pos = 0; pos < plane_size; ++pos) {
                if (mask_ptr[pos] == 0) {
                    image_ptr[plane * image.layout().stride[1] + pos] = 0;
                }
            }
        }

        std::shared_ptr<HostTensorND> conv_out;
        convolution_brute({inp[0], inp[1]}, conv_out, conv_param);
        dest[0] = *conv_out;
    };

    // replaces the mask tensor with an int32 tensor generated over [0, 1]
    auto gen_mask = [](HostTensorND& dest) {
        HostTensorGenerator<dtype::Int32, RandomDistribution::UNIFORM> generator{
                0, 1};
        dest = *generator(dest.shape(), dest.comp_node());
    };

    auto run_with_param = [&](size_t FH, size_t FW, size_t SH = 1, size_t SW = 1,
                              size_t PH = 0, size_t PW = 0, size_t DH = 1,
                              size_t DW = 1) {
        // the convolution and the mask propagation share the same geometry
        conv_param.pad_h = PH;
        conv_param.pad_w = PW;
        conv_param.stride_h = SH;
        conv_param.stride_w = SW;
        conv_param.dilate_h = DH;
        conv_param.dilate_w = DW;

        mask_param.pad_h = PH;
        mask_param.pad_w = PW;
        mask_param.stride_h = SH;
        mask_param.stride_w = SW;
        mask_param.kernel_h = FH;
        mask_param.kernel_w = FW;
        mask_param.dilate_h = DH;
        mask_param.dilate_w = DW;

        Checker checker{build_graph, reference};
        Checker::RunOptions run_opt;
        checker.set_output_allow_grad(0, false);
        checker.set_input_dtype(2, dtype::Int32());
        checker.set_input_generator(2, gen_mask);

        // the mask has the spatial shape of the input image
        auto run_case = [&](size_t N, size_t IC, size_t OC, size_t IH, size_t IW) {
            checker.run(
                    {TensorShape{N, IC, IH, IW}, {OC, IC, FH, FW}, {IH, IW}},
                    run_opt);
        };
        run_case(1, 1, 1, 5, 5);
        run_case(2, 3, 4, 5, 5);
        run_case(3, 3, 4, 224, 223);
        run_case(3, 3, 4, 224, 223);
    };

    run_with_param(3, 3, 1, 1, 0, 0, 2, 2);
    run_with_param(3, 3, 2, 2, 3, 3);
    run_with_param(4, 2, 3, 2, 1, 2);
    run_with_param(2, 4, 2, 3, 2, 2);
    run_with_param(4, 2, 3, 2, 1, 2, 2, 2);
    run_with_param(2, 4, 2, 3, 2, 2, 2, 1);
}
//! Brute-force float32 reference for the dense 3D convolution operator.
//!
//! \param in_tensor  exactly two 5-dim tensors: the volume, indexed as
//!                   (batch, channel, depth, row, col), and the filter, indexed
//!                   as (out channel, in channel, depth, row, col).
//! \param out_tensor receives a freshly allocated result tensor on "xpu0".
//! \param param      pad/stride/dilate and mode (CONVOLUTION flips the filter,
//!                   CROSS_CORRELATION does not).
void convolution3d_brute(
        const std::vector<std::shared_ptr<HostTensorND>>& in_tensor,
        std::shared_ptr<HostTensorND>& out_tensor,
        const opr::Convolution3D::Param& param) {
    mgb_assert(in_tensor.size() == 2);
    auto src = in_tensor[0], flt = in_tensor[1];
    mgb_assert(src->shape().ndim == 5);
    mgb_assert(flt->shape().ndim == 5);

    const int batch = src->shape().shape[0], in_chl = src->shape().shape[1],
              in_d = src->shape().shape[2], in_h = src->shape().shape[3],
              in_w = src->shape().shape[4];
    const int flt_d = flt->shape().shape[2], flt_h = flt->shape().shape[3],
              flt_w = flt->shape().shape[4];
    const int pad_d = param.pad_d, pad_h = param.pad_h, pad_w = param.pad_w;
    const int stride_d = param.stride_d, stride_h = param.stride_h,
              stride_w = param.stride_w;
    const int dilate_d = param.dilate_d, dilate_h = param.dilate_h,
              dilate_w = param.dilate_w;

    // extent covered by the dilated filter along each axis
    const int ext_d = (flt_d - 1) * dilate_d + 1;
    const int ext_h = (flt_h - 1) * dilate_h + 1;
    const int ext_w = (flt_w - 1) * dilate_w + 1;
    mgb_assert(in_d + 2 * pad_d >= ext_d);
    mgb_assert(in_h + 2 * pad_h >= ext_h);
    mgb_assert(in_w + 2 * pad_w >= ext_w);
    const int out_d = (in_d + 2 * pad_d - ext_d) / stride_d + 1;
    const int out_h = (in_h + 2 * pad_h - ext_h) / stride_h + 1;
    const int out_w = (in_w + 2 * pad_w - ext_w) / stride_w + 1;

    mgb_assert(static_cast<size_t>(in_chl) == flt->shape().shape[1]);
    const int out_chl = flt->shape().shape[0];

    auto idx = [](int v) { return static_cast<size_t>(v); };
    using Mode3D = opr::Convolution3D::Param::Mode;

    out_tensor = std::make_shared<HostTensorND>(
            CompNode::load("xpu0"),
            TensorShape{
                    idx(batch), idx(out_chl), idx(out_d), idx(out_h), idx(out_w)});

    for (int n = 0; n < batch; ++n) {
        for (int oc = 0; oc < out_chl; ++oc) {
            for (int od = 0; od < out_d; ++od) {
                // origin of the receptive field in input coordinates
                const int front = od * stride_d - pad_d;
                for (int oh = 0; oh < out_h; ++oh) {
                    const int top = oh * stride_h - pad_h;
                    for (int ow = 0; ow < out_w; ++ow) {
                        const int left = ow * stride_w - pad_w;
                        float& acc = out_tensor->ptr<float>(
                                {idx(n), idx(oc), idx(od), idx(oh), idx(ow)})[0];
                        acc = 0;
                        for (int ic = 0; ic < in_chl; ++ic) {
                            for (int kd = 0; kd < flt_d; ++kd) {
                                for (int kh = 0; kh < flt_h; ++kh) {
                                    for (int kw = 0; kw < flt_w; ++kw) {
                                        int src_z, src_y, src_x;
                                        if (param.mode == Mode3D::CONVOLUTION) {
                                            // true convolution walks the filter
                                            // mirrored
                                            src_z = front +
                                                    (flt_d - kd - 1) * dilate_d;
                                            src_y = top + (flt_h - kh - 1) * dilate_h;
                                            src_x = left +
                                                    (flt_w - kw - 1) * dilate_w;
                                        } else {
                                            mgb_assert(
                                                    param.mode ==
                                                    Mode3D::CROSS_CORRELATION);
                                            src_z = front + kd * dilate_d;
                                            src_y = top + kh * dilate_h;
                                            src_x = left + kw * dilate_w;
                                        }
                                        const bool inside =
                                                src_z >= 0 && src_z < in_d &&
                                                src_y >= 0 && src_y < in_h &&
                                                src_x >= 0 && src_x < in_w;
                                        // padded positions read as zero
                                        float img_val = 0;
                                        if (inside) {
                                            img_val = src->ptr<float>(
                                                    {idx(n), idx(ic), idx(src_z),
                                                     idx(src_y), idx(src_x)})[0];
                                        }
                                        const float flt_val = flt->ptr<float>(
                                                {idx(oc), idx(ic), idx(kd), idx(kh),
                                                 idx(kw)})[0];
                                        acc += img_val * flt_val;
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}
/*!
 * Compares opr::Convolution3D forward output with convolution3d_brute over a
 * sweep of filter size, padding, stride, dilation and convolution mode.
 */
TEST(TestOprDNN, Convolution3DForward) {
    // Extents that stay fixed for every case.
    const uint32_t batch_size = 8, id = 12, ic = 4, oc = ic;
    // Height and width are offset from depth so the spatial extents differ.
    const uint32_t ih = id + 1, iw = ih + 1;
    for (uint32_t fd : {1, 3}) {
        for (uint32_t pd : {0, 2}) {
            for (uint32_t sd : {1, 3}) {
                for (uint32_t dd : {1, 3}) {
                    for (bool xcorr : {false, true}) {
                        // The filter is cubic; pad/stride grow by one per axis.
                        const uint32_t fh = fd, fw = fh;
                        const uint32_t ph = pd + 1, pw = ph + 1;
                        const uint32_t sh = sd + 1, sw = sh + 1;
                        const auto mode = xcorr ? Param3D::Mode::CROSS_CORRELATION
                                                : Param3D::Mode::CONVOLUTION;
                        Param3D param{mode, pd, ph, pw, sd, sh, sw, dd, dd, dd};
                        opr::test::ForwardChecker<opr::Convolution3D, 2> checker(
                                {{batch_size, ic, id, ih, iw}, {oc, ic, fd, fh, fw}},
                                convolution3d_brute, param);
                        checker.run();
                    }
                }
            }
        }
    }
}
/*!
 * Runs opr::test::BackwardChecker on opr::Convolution3D over the same sweep of
 * filter size, padding, stride, dilation and mode as the forward test.
 */
TEST(TestOprDNN, Convolution3DBackward) {
    // Extents that stay fixed for every case.
    const uint32_t batch_size = 8, id = 12, ic = 4, oc = ic;
    // Height and width are offset from depth so the spatial extents differ.
    const uint32_t ih = id + 1, iw = ih + 1;
    for (uint32_t fd : {1, 3}) {
        for (uint32_t pd : {0, 2}) {
            for (uint32_t sd : {1, 3}) {
                for (uint32_t dd : {1, 3}) {
                    for (bool xcorr : {false, true}) {
                        // The filter is cubic; pad/stride grow by one per axis.
                        const uint32_t fh = fd, fw = fh;
                        const uint32_t ph = pd + 1, pw = ph + 1;
                        const uint32_t sh = sd + 1, sw = sh + 1;
                        const auto mode = xcorr ? Param3D::Mode::CROSS_CORRELATION
                                                : Param3D::Mode::CONVOLUTION;
                        Param3D param{mode, pd, ph, pw, sd, sh, sw, dd, dd, dd};
                        opr::test::BackwardChecker<opr::Convolution3D, 2> checker(
                                {{batch_size, ic, id, ih, iw}, {oc, ic, fd, fh, fw}},
                                param, 1e-2, 1);
                        checker.run();
                    }
                }
            }
        }
    }
}
/*!
 * Checks group-sparse opr::Convolution3D against a reference that runs a
 * non-grouped Convolution3D once per group on the matching input/filter slices
 * and writes each result into the matching output channel slice.
 */
TEST(TestOprDNN, GroupConv3D) {
    using Checker = AutoOprChecker<2, 1>;
    using ConvParam = opr::Convolution3D::Param;

    ConvParam param;
    param.pad_d = 0;
    param.pad_h = 1;
    param.pad_w = 0;
    param.stride_d = 1;
    param.stride_h = 2;

    // Operator under test: same param, but with GROUP sparsity.
    auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
        ConvParam grouped = param;
        grouped.sparse = ConvParam::Sparse::GROUP;
        return {opr::Convolution3D::make(inputs[0], inputs[1], grouped)};
    };

    // Reference graph: one convolution with the unmodified param, fed through
    // host tensors that are refilled for every group.
    auto cn = CompNode::load("xpux");
    auto slice_src = std::make_shared<HostTensorND>(cn, dtype::Float32());
    auto slice_flt = std::make_shared<HostTensorND>(cn, dtype::Float32());
    HostTensorND slice_out;
    auto ref_graph = ComputingGraph::make();
    auto ref_func = ref_graph->compile({make_callback_copy(
            opr::Convolution3D::make(
                    opr::Host2DeviceCopy::make(*ref_graph, slice_src),
                    opr::Host2DeviceCopy::make(*ref_graph, slice_flt), param),
            slice_out)});

    auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
        auto&& full_out = dest[0];
        const auto nr_group = inp[1]->layout()[0];
        // Per-group view of the input: channel extent divided by group count.
        auto src_slice = inp[0]->layout();
        src_slice.shape[1] /= nr_group;
        // Per-group view of the filter: the leading group axis is dropped.
        auto flt_slice = inp[1]->layout().remove_axis(0);
        TensorLayout out_slice;
        for (size_t g = 0; g < nr_group; ++g) {
            const auto src_offset =
                    g * src_slice[1] * src_slice[2] * src_slice[3] * src_slice[4];
            slice_src->copy_from(inp[0]->sub(
                    SubTensorSpec::make_from_offset_elem(src_slice, src_offset)));
            const auto flt_offset = g * flt_slice.total_nr_elems();
            slice_flt->copy_from(inp[1]->sub(
                    SubTensorSpec::make_from_offset_elem(flt_slice, flt_offset)));
            ref_func->execute();
            if (g == 0) {
                // The output shape is only known after the first run.
                auto full_shape = slice_out.shape();
                full_shape[1] *= nr_group;
                full_out.resize(full_shape);
                out_slice = full_out.layout();
                out_slice[1] /= nr_group;
            }
            const auto out_offset =
                    g * out_slice[1] * out_slice[2] * out_slice[3] * out_slice[4];
            full_out.sub(SubTensorSpec::make_from_offset_elem(out_slice, out_offset))
                    .copy_from_fixlayout(slice_out);
        }
    };

    Checker::RunOptions opt;
    opt.numdiff_eps = 1;
    opt.outputs_max_err = 5e-5;
    Checker checker{make_graph, fwd};

    // Builds the 6-dim grouped filter shape from the input channel count.
    auto run = [&](const TensorShape& ishp, size_t fd, size_t fh, size_t fw, size_t oc,
                   size_t group) {
        const size_t ic = ishp[1];
        const TensorShape filter_shape{group, oc / group, ic / group, fd, fh, fw};
        checker.run({ishp, filter_shape}, opt);
    };
    run({1, 2, 1, 1, 1}, 1, 1, 1, 2, 2);
    run({3, 9, 5, 4, 3}, 1, 2, 3, 6, 3);
    // The last configuration is exercised twice.
    for (int repeat = 0; repeat < 2; ++repeat) {
        run({2, 1, 3, 6, 9}, 2, 3, 3, 5, 1);
    }
}
/*!
 * Checks opr::Convolution3DBackwardData::make_deconv with GROUP sparsity
 * against a reference that scatters every input element, scaled by the filter
 * taps, into a zero-initialised output.
 */
TEST(TestOprDNN, Deconvolution3D) {
    using Checker = AutoOprChecker<2, 1>;
    Param3D param{Param3D::Mode::CROSS_CORRELATION, 0, 1, 1, 1, 2, 2};
    param.sparse = Param3D::Sparse::GROUP;

    auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
        return {opr::Convolution3DBackwardData::make_deconv(
                inputs[0], inputs[1], param)};
    };

    auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
        auto&& data = *inp[0];
        auto&& filter = *inp[1];
        const size_t N = data.shape(0);
        const size_t ID = data.shape(2), IH = data.shape(3), IW = data.shape(4);
        const size_t GROUP = filter.shape(0);
        const size_t ICPG = filter.shape(1), OCPG = filter.shape(2);
        const size_t FD = filter.shape(3), FH = filter.shape(4), FW = filter.shape(5);

        // Output extent along one axis for the given input/filter extents.
        auto out_extent = [](size_t inp, size_t filter, size_t stride, size_t pad,
                             size_t dilate) {
            return (inp - 1) * stride + (filter - 1) * dilate + 1 - pad * 2;
        };
        const size_t OD =
                out_extent(ID, FD, param.stride_d, param.pad_d, param.dilate_d);
        const size_t OH =
                out_extent(IH, FH, param.stride_h, param.pad_h, param.dilate_h);
        const size_t OW =
                out_extent(IW, FW, param.stride_w, param.pad_w, param.dilate_w);

        auto&& out = dest[0];
        out.resize({N, OCPG * GROUP, OD, OH, OW});
        float* fptr = filter.ptr<float>();
        float* dptr = data.ptr<float>();
        float* optr = out.ptr<float>();
        memset(optr, 0, sizeof(float) * out.shape().total_nr_elems());
        auto ol = out.layout();
        auto fl = filter.layout();

        // Adds the contribution of one input element (value `scale`) to every
        // in-range output position reached through the filter taps.
        auto scatter = [&](size_t n, size_t group, size_t icg, size_t id, size_t ih,
                           size_t iw, float scale) {
            for (size_t ocg = 0; ocg < OCPG; ++ocg) {
                for (size_t fd = 0; fd < FD; ++fd) {
                    for (size_t fh = 0; fh < FH; ++fh) {
                        for (size_t fw = 0; fw < FW; ++fw) {
                            auto oc_tot = group * OCPG + ocg;
                            // Signed so that padding can push the position
                            // below zero and be rejected.
                            int od = int(id * param.stride_d + fd * param.dilate_d) -
                                     int(param.pad_d);
                            int oh = int(ih * param.stride_h + fh * param.dilate_h) -
                                     int(param.pad_h);
                            int ow = int(iw * param.stride_w + fw * param.dilate_w) -
                                     int(param.pad_w);
                            const bool inside =
                                    od >= 0 && oh >= 0 && ow >= 0 &&
                                    od < static_cast<int>(OD) &&
                                    oh < static_cast<int>(OH) &&
                                    ow < static_cast<int>(OW);
                            if (!inside) {
                                continue;
                            }
                            auto out_off = n * ol.stride[0] + oc_tot * ol.stride[1] +
                                           od * ol.stride[2] + oh * ol.stride[3] + ow;
                            auto flt_off = group * fl.stride[0] + icg * fl.stride[1] +
                                           ocg * fl.stride[2] + fd * fl.stride[3] +
                                           fh * fl.stride[4] + fw;
                            optr[out_off] += scale * fptr[flt_off];
                        }
                    }
                }
            }
        };

        // The input is consumed sequentially through dptr in this loop order.
        for (size_t n = 0; n < N; ++n) {
            for (size_t group = 0; group < GROUP; ++group) {
                for (size_t icg = 0; icg < ICPG; ++icg) {
                    for (size_t id = 0; id < ID; ++id) {
                        for (size_t ih = 0; ih < IH; ++ih) {
                            for (size_t iw = 0; iw < IW; ++iw) {
                                float scale = *(dptr++);
                                scatter(n, group, icg, id, ih, iw, scale);
                            }
                        }
                    }
                }
            }
        }
    };

    Checker::RunOptions opt;
    opt.numdiff_eps = 1;
    Checker checker(make_graph, fwd);
    checker.run({TensorShape{2, 4, 3, 3, 2}, {1, 4, 5, 3, 2, 2}}, opt);
    checker.run({TensorShape{3, 2, 1, 1, 1}, {2, 1, 1, 4, 3, 3}}, opt);
    checker.run({TensorShape{4, 6, 2, 2, 2}, {2, 3, 4, 6, 5, 4}}, opt);
}
/*!
 * Runs opr::Convolution3D under each execution strategy and compares the
 * result with convolution3d_brute.
 *
 * The strategy list depends on MGB_ENABLE_FASTRUN. The non-fastrun list used
 * to spell the first entry `S : HEURISTIC`, which does not compile; it is now
 * `S::HEURISTIC`.
 */
TEST(TestOprDNN, Convolution3DExePolicy) {
    Param3D param{Param3D::Mode::CONVOLUTION};
    using Policy = opr::Convolution3D::ExecutionPolicy;
    using S = Policy::Strategy;
    const SmallVector<S> strategies{
#if MGB_ENABLE_FASTRUN
            S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
            S::PROFILE | S::HEURISTIC
#else
            S::HEURISTIC, S::PROFILE | S::HEURISTIC
#endif
    };
    for (auto strategy : strategies) {
        using Checker = AutoOprChecker<2, 1>;
        auto make_graph =
                [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
            Policy policy;
            policy.strategy = strategy;
            auto out = opr::Convolution3D::make(inputs[0], inputs[1], param, policy);
            return {out};
        };
        // Reference result from the brute-force implementation.
        auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
            std::shared_ptr<HostTensorND> sh_out;
            convolution3d_brute({inp.begin(), inp.end()}, sh_out, param);
            dest[0] = *sh_out;
        };
        Checker::RunOptions opt;
        opt.numdiff_eps = 1;
        Checker(make_graph, fwd)
                .run({TensorShape{3, 2, 3, 4, 1}, {4, 2, 2, 2, 1}}, opt)
                .run({TensorShape{3, 3, 2, 6, 2}, {2, 3, 1, 4, 1}}, opt)
                .run({TensorShape{1, 1, 4, 4, 4}, {2, 1, 3, 3, 3}}, opt);
    }
}
/*!
 * Checks opr::ConvBiasForward with two inputs (no bias) and three inputs
 * (with bias) against convolution_brute, adding the bias by hand for the
 * three-input reference.
 */
TEST(TestOprDNN, ConvBiasForward) {
    using Checker2 = AutoOprChecker<2, 1>;
    using Checker3 = AutoOprChecker<3, 1>;
    opr::ConvBiasForward::Param param;

    auto make_graph2 =
            [&](const Checker2::SymInpArray& inputs) -> Checker2::SymOutArray {
        return {opr::ConvBiasForward::make(inputs[0], inputs[1], param)};
    };
    auto make_graph3 =
            [&](const Checker3::SymInpArray& inputs) -> Checker3::SymOutArray {
        return {opr::ConvBiasForward::make(inputs[0], inputs[1], inputs[2], param)};
    };

    // Reference without bias: plain brute-force convolution.
    auto fwd2 = [&](Checker2::NumOutArray& dest, Checker2::NumInpArray inp) {
        std::shared_ptr<HostTensorND> conv_out;
        convolution_brute({inp[0], inp[1]}, conv_out, convert_to_conv_param(param));
        dest[0] = *conv_out;
    };
    // Reference with bias: brute-force convolution, then one bias value per
    // output channel added to every spatial position of every batch item.
    auto fwd3 = [&](Checker3::NumOutArray& dest, Checker3::NumInpArray inp) {
        std::shared_ptr<HostTensorND> conv_out;
        convolution_brute({inp[0], inp[1]}, conv_out, convert_to_conv_param(param));
        dest[0] = *conv_out;
        const size_t batch = dest[0].shape()[0];
        const size_t channels = dest[0].shape()[1];
        const size_t spatial = dest[0].shape()[2] * dest[0].shape()[3];
        float* out_ptr = dest[0].ptr<float>();
        const float* bias = inp[2]->ptr<float>();
        for (size_t n = 0; n < batch; ++n) {
            for (size_t c = 0; c < channels; ++c) {
                for (size_t pos = 0; pos < spatial; ++pos) {
                    *out_ptr += bias[c];
                    ++out_ptr;
                }
            }
        }
    };

    auto run_with_param = [&](size_t SH = 1, size_t SW = 1, size_t PH = 0,
                              size_t PW = 0) {
        param.stride_h = SH;
        param.stride_w = SW;
        param.pad_h = PH;
        param.pad_w = PW;

        Checker2 checker2{make_graph2, fwd2};
        checker2.set_output_allow_grad(0, false);
        Checker2::RunOptions opt2;

        Checker3 checker3{make_graph3, fwd3};
        checker3.set_output_allow_grad(0, false);
        Checker3::RunOptions opt3;

        auto run = [&](size_t N, size_t IC, size_t OC, size_t IH, size_t IW, size_t FH,
                       size_t FW) {
            const TensorShape src_shape{N, IC, IH, IW};
            const TensorShape flt_shape{OC, IC, FH, FW};
            // The deduced layout is not consumed in this test.
            auto opr = megdnn_naive_handle()
                               ->create_operator<megdnn::ConvolutionForward>();
            opr->param() = convert_to_conv_param(param);
            TensorLayout dest_layout;
            opr->deduce_layout(
                    {src_shape, dtype::Float32()}, {flt_shape, dtype::Float32()},
                    dest_layout);
            checker2.run({src_shape, flt_shape}, opt2);
            checker3.run({src_shape, flt_shape, TensorShape{1, OC, 1, 1}}, opt3);
        };
        run(1, 1, 1, 5, 5, 1, 1);
        run(1, 1, 1, 5, 5, 3, 3);
        run(2, 3, 4, 5, 5, 3, 3);
        run(3, 3, 4, 224, 223, 3, 3);
        run(3, 3, 4, 224, 223, 2, 2);
    };

    // Arguments are (stride_h, stride_w, pad_h, pad_w).
    run_with_param();
    run_with_param(2, 2, 3, 3);
    run_with_param(3, 2, 1, 2);
    run_with_param(2, 3, 2, 2);
}
/*!
 * Checks the four-input form of opr::ConvBiasForward (src, filter, bias, z)
 * against convolution_brute with the bias and z terms added by hand.
 * Skipped by REQUIRE_GPU(1) when no GPU is available.
 */
TEST(TestOprDNN, ConvBiasForwardWithZ) {
    REQUIRE_GPU(1);
    using Checker4 = AutoOprChecker<4, 1>;
    opr::ConvBiasForward::Param param;

    auto make_graph4 =
            [&](const Checker4::SymInpArray& inputs) -> Checker4::SymOutArray {
        return {opr::ConvBiasForward::make(
                inputs[0], inputs[1], inputs[2], inputs[3], param)};
    };

    // Reference: brute-force convolution, then per element add the channel's
    // bias value and the matching element of z.
    auto fwd4 = [&](Checker4::NumOutArray& dest, Checker4::NumInpArray inp) {
        std::shared_ptr<HostTensorND> conv_out;
        convolution_brute({inp[0], inp[1]}, conv_out, convert_to_conv_param(param));
        dest[0] = *conv_out;
        const size_t batch = dest[0].shape()[0];
        const size_t channels = dest[0].shape()[1];
        const size_t spatial = dest[0].shape()[2] * dest[0].shape()[3];
        float* out_ptr = dest[0].ptr<float>();
        const float* z_ptr = inp[3]->ptr<float>();
        const float* bias = inp[2]->ptr<float>();
        for (size_t n = 0; n < batch; ++n) {
            for (size_t c = 0; c < channels; ++c) {
                for (size_t pos = 0; pos < spatial; ++pos) {
                    *out_ptr += bias[c] + *z_ptr;
                    ++out_ptr;
                    ++z_ptr;
                }
            }
        }
    };

    auto run_with_param = [&](size_t SH = 1, size_t SW = 1, size_t PH = 0,
                              size_t PW = 0) {
        param.stride_h = SH;
        param.stride_w = SW;
        param.pad_h = PH;
        param.pad_w = PW;

        Checker4 checker4{make_graph4, fwd4};
        checker4.set_output_allow_grad(0, false);
        Checker4::RunOptions opt4;

        auto run = [&](size_t N, size_t IC, size_t OC, size_t IH, size_t IW, size_t FH,
                       size_t FW) {
            const TensorShape src_shape{N, IC, IH, IW};
            const TensorShape flt_shape{OC, IC, FH, FW};
            // The naive megdnn operator is used only to deduce the output
            // spatial extents, which give z its shape.
            auto opr = megdnn_naive_handle()
                               ->create_operator<megdnn::ConvolutionForward>();
            opr->param() = convert_to_conv_param(param);
            TensorLayout dest_layout;
            opr->deduce_layout(
                    {src_shape, dtype::Float32()}, {flt_shape, dtype::Float32()},
                    dest_layout);
            checker4.run(
                    {src_shape, flt_shape, TensorShape{1, OC, 1, 1},
                     TensorShape{N, OC, dest_layout[2], dest_layout[3]}},
                    opt4);
        };
        run(1, 1, 1, 5, 5, 3, 3);
        run(2, 3, 4, 5, 5, 3, 3);
        run(3, 3, 4, 224, 223, 3, 3);
        run(3, 3, 4, 224, 223, 2, 2);
    };

    // Arguments are (stride_h, stride_w, pad_h, pad_w).
    run_with_param();
    run_with_param(2, 2, 3, 3);
    run_with_param(3, 2, 1, 2);
    run_with_param(2, 3, 2, 2);
}
/*!
 * Builds a quantized Convolution + bias (+ optional RELU) graph in NCHW4
 * format and checks that it gives the same result directly and after
 * gopt::optimize_for_inference with conv-bias-nonlinearity fusion enabled.
 */
TEST(TestOprDNN, ConvBiasINT8x8xX_NCHW4) {
    using Checker = AutoOprChecker<3, 1>;
    using Param = opr::ConvBias::Param;
    opr::ConvBiasForward::Param param;

    auto make_quantized = [&](SymbolVar x, const DType& dtype) {
        return opr::TypeCvt::make(x, dtype);
    };

    // Unfused graph: conv on S8 inputs, S32 bias add, optional RELU, requantize
    // to S8, then convert to float for comparison.
    auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
        auto conv_param = convert_to_conv_param(param);
        auto src_q = make_quantized(inputs[0], dtype::QuantizedS8(0.3f));
        auto flt_q = make_quantized(inputs[1], dtype::QuantizedS8(0.1f));
        auto acc = opr::Convolution::make(src_q, flt_q, conv_param);
        acc = acc + make_quantized(inputs[2], dtype::QuantizedS32(0.03f));
        if (param.nonlineMode == Param::NonlineMode::RELU) {
            acc = opr::Elemwise::make({acc}, {opr::Elemwise::Mode::RELU});
        }
        acc = opr::TypeCvt::make(acc, dtype::QuantizedS8(0.5f));
        return {opr::TypeCvt::make(acc, dtype::Float32())};
    };

    // Reference: the same graph after the inference optimization pass.
    auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
        auto graph = ComputingGraph::make();
        Checker::SymInpArray inputs;
        for (size_t idx = 0; idx < inp.size(); ++idx) {
            inputs[idx] = opr::Host2DeviceCopy::make(*graph, inp[idx]);
        }
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_fuse_conv_bias_nonlinearity();
        auto fused = gopt::optimize_for_inference({make_graph(inputs)[0]}, options)[0];
        auto func = graph->compile({make_callback_copy(fused, dest[0])});
        func->execute();
        func->wait();
    };

    auto run_with_param = [&](size_t SH = 1, size_t SW = 1, size_t PH = 0,
                              size_t PW = 0, size_t group = 1) {
        param.stride_h = SH;
        param.stride_w = SW;
        param.pad_h = PH;
        param.pad_w = PW;
        param.format = Param::Format::NCHW4;
        // sparse is only ever switched to GROUP and never reset, so it stays
        // GROUP for every call after the first grouped one.
        if (group != 1) {
            param.sparse = Param::Sparse::GROUP;
        }
        Checker checker{make_graph, fwd, CompNode::load("cpu0")};
        checker.set_output_allow_grad(0, false);
        Checker::RunOptions opt;
        auto run = [&](size_t N, size_t IC, size_t OC, size_t IH, size_t IW, size_t FH,
                       size_t FW) {
            mgb_assert(IC % 4 == 0 && OC % 4 == 0);
            const TensorShape src_shape{N, group * IC / 4, IH, IW, 4};
            const TensorShape flt_shape{group, OC, IC / 4, FH, FW, 4};
            const TensorShape bias_shape{1, group * OC / 4, 1, 1, 4};
            checker.run({src_shape, flt_shape, bias_shape}, opt);
        };
        // The same shape is exercised three times.
        for (int repeat = 0; repeat < 3; ++repeat) {
            run(1, 8, 8, 56, 56, 3, 3);
        }
    };

    // The grouped case runs first; see the note on param.sparse above.
    run_with_param(1, 1, 1, 1, 8);
    run_with_param();
    run_with_param(2, 2, 3, 3);
    run_with_param(3, 2, 1, 2);
    run_with_param(2, 3, 2, 2);
}
/*!
 * Verifies the output dtype that opr::Convolution infers for Quantized8Asymm,
 * QuantizedS8 and Int8 inputs, including the output scale for the quantized
 * cases.
 */
TEST(TestOprDNN, ConvolutionDTypeInference) {
    Param param;
    param.mode = Mode::CONVOLUTION;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();

    // Builds a convolution of a {1, 3, 7, 7} input with an {8, 3, 1, 1} filter
    // of the given dtypes and returns the dtype of its output var.
    auto conv_out_dtype = [&](const DType& inp_dtype, const DType& filt_dtype) {
        HostTensorND inp_host{cn, {1, 3, 7, 7}, inp_dtype};
        HostTensorND filt_host{cn, {8, 3, 1, 1}, filt_dtype};
        auto inp = opr::ImmutableTensor::make(*graph, inp_host);
        auto filt = opr::ImmutableTensor::make(*graph, filt_host);
        return opr::Convolution::make(inp, filt, param).dtype();
    };

    const DType asymm_out = conv_out_dtype(
            dtype::Quantized8Asymm(0.233f, (uint8_t)123),
            dtype::Quantized8Asymm(0.874f, (uint8_t)234));
    ASSERT_EQ(asymm_out.enumv(), DTypeEnum::QuantizedS32);
    EXPECT_EQ(asymm_out.param<dtype::QuantizedS32>().scale, 0.233f * 0.874f);

    const DType qs8_out =
            conv_out_dtype(dtype::QuantizedS8(0.1234f), dtype::QuantizedS8(0.2345f));
    ASSERT_EQ(qs8_out.enumv(), DTypeEnum::QuantizedS32);
    EXPECT_EQ(qs8_out.param<dtype::QuantizedS32>().scale, 0.1234f * 0.2345f);

    const DType int8_out = conv_out_dtype(dtype::Int8(), dtype::Int8());
    ASSERT_EQ(int8_out.enumv(), DTypeEnum::Int32);
}
/*!
 * Verifies that opr::ConvBiasForward on QuantizedS8 inputs takes the output
 * dtype requested through OperatorNodeConfig (QuantizedS8 with scale
 * inp_scale * filt_scale).
 *
 * The bias var is built from bias_host. It used to be built from filt_host by
 * mistake, which left bias_host unused and fed a QuantizedS8 filter tensor in
 * as the bias. bias_host holds one value per output channel, {1, OC, 1, 1}
 * with OC = 8, matching the bias shape used by the other ConvBias tests.
 */
TEST(TestOprDNN, ConvBiasINT8x8xXDTypeInference) {
    float inp_scale = 1.926f;
    float filt_scale = 0.817f;
    float bias_scale = inp_scale * filt_scale;
    opr::ConvBias::Param param;
    param.mode = Mode::CONVOLUTION;
    auto cn = CompNode::load("cpu0");
    auto graph = ComputingGraph::make();
    HostTensorND inp_host{cn, {1, 3, 7, 7}, dtype::QuantizedS8(inp_scale)};
    HostTensorND filt_host{cn, {8, 3, 1, 1}, dtype::QuantizedS8(filt_scale)};
    DType output_dtype = dtype::QuantizedS8(bias_scale);
    HostTensorND bias_host{cn, {1, 8, 1, 1}, dtype::QuantizedS32(bias_scale)};
    auto inp = opr::ImmutableTensor::make(*graph, inp_host);
    auto filt = opr::ImmutableTensor::make(*graph, filt_host);
    auto bias = opr::ImmutableTensor::make(*graph, bias_host);
    auto opr = opr::ConvBiasForward::make(
            inp, filt, bias, param, {}, OperatorNodeConfig{output_dtype});
    ASSERT_EQ(opr.dtype().enumv(), DTypeEnum::QuantizedS8);
    EXPECT_EQ(opr.dtype().param<dtype::QuantizedS8>().scale, bias_scale);
}
/*!
 * Dumps a quantized opr::ConvBiasForward graph to a file, loads it back and
 * checks that the single output var keeps the configured output dtype.
 *
 * The bias var is built from bias_host. It used to be built from filt_host by
 * mistake, which left bias_host unused. bias_host holds one value per output
 * channel, {1, OC, 1, 1} with OC = 8.
 */
TEST(TestOprDNN, ConvBiasINT8x8xXSerialization) {
    using namespace serialization;
    float inp_scale = 1.926f;
    float filt_scale = 0.817f;
    float bias_scale = inp_scale * filt_scale;
    DType output_dtype = dtype::QuantizedS8(bias_scale);
    auto fname = output_file("ConvBiasINT8x8xXTest");
    // Build the graph and write it to fname.
    auto dump = [&]() {
        opr::ConvBias::Param param;
        param.mode = Mode::CONVOLUTION;
        auto cn = CompNode::load("cpu0");
        auto graph = ComputingGraph::make();
        HostTensorND inp_host{cn, {1, 3, 7, 7}, dtype::QuantizedS8(inp_scale)};
        HostTensorND filt_host{cn, {8, 3, 1, 1}, dtype::QuantizedS8(filt_scale)};
        HostTensorND bias_host{cn, {1, 8, 1, 1}, dtype::QuantizedS32(bias_scale)};
        auto inp = opr::ImmutableTensor::make(*graph, inp_host);
        auto filt = opr::ImmutableTensor::make(*graph, filt_host);
        auto bias = opr::ImmutableTensor::make(*graph, bias_host);
        auto opr = opr::ConvBiasForward::make(
                inp, filt, bias, param, {}, OperatorNodeConfig{output_dtype});
        auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
        auto rst = dumper->dump({opr});
        ASSERT_EQ(rst.outputs.size(), 1u);
    };
    // Read the graph back and check the output dtype survived.
    auto load = [&]() {
        auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
        auto rst = loader->load();
        ASSERT_EQ(rst.output_var_list.size(), 1u);
        EXPECT_EQ(rst.output_var_list[0].dtype(), output_dtype);
    };
    dump();
    load();
}
/*!
 * Checks opr::LocalShare forward output against local_share_brute for several
 * filter sizes, strides and spatial group counts. Gradients are not checked.
 * Skipped by REQUIRE_GPU(1) when no GPU is available.
 */
TEST(TestOprDNN, LocalShareForward) {
    REQUIRE_GPU(1);
    using Checker = AutoOprChecker<2, 1>;
    using Param = opr::LocalShare::Param;
    Param param;
    param.mode = Param::Mode::CROSS_CORRELATION;
    param.sparse = Param::Sparse::DENSE;

    auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
        return {opr::LocalShare::make(inputs[0], inputs[1], param)};
    };
    auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
        mgb_assert(inp.size() == 2);
        mgb_assert(dest.size() == 1);
        std::shared_ptr<HostTensorND> expected;
        local_share_brute({inp[0], inp[1]}, expected, param);
        dest[0] = *expected;
    };

    auto run_with_param = [&](size_t fh = 3, size_t fw = 3, size_t sh = 1,
                              size_t sw = 1, size_t sgh = 3, size_t sgw = 3) {
        // Padding is half the filter extent.
        const size_t ph = fh / 2, pw = fw / 2;
        param.pad_h = ph;
        param.pad_w = pw;
        param.stride_h = sh;
        param.stride_w = sw;
        param.spatial_groups_h = sgh;
        param.spatial_groups_w = sgw;

        Checker checker{make_graph, fwd};
        checker.set_output_allow_grad(0, false);
        checker.set_input_dtype(0, dtype::Float32());
        checker.set_input_dtype(1, dtype::Float32());
        Checker::RunOptions opt;

        auto run = [&](size_t n, size_t ci, size_t co, size_t hi, size_t wi) {
            const size_t ho = (hi + 2 * ph - fh) / sh + 1;
            const size_t wo = (wi + 2 * pw - fw) / sw + 1;
            // Cases whose output does not split evenly into the spatial
            // groups are skipped.
            const bool splits_evenly = (ho % sgh == 0) && (wo % sgw == 0);
            if (splits_evenly) {
                checker.run(
                        {TensorShape{n, ci, hi, wi},
                         TensorShape{sgh, sgw, ci, fh, fw, co}},
                        opt);
            }
        };
        run(32, 2, 7, 24, 24);
        run(16, 2, 7, 24, 24);
        run(32, 2, 8, 12, 12);
        run(16, 2, 9, 6, 6);
    };

    // Columns: fh, fw, sh, sw, sgh, sgw.
    const size_t cases[][6] = {
            {1, 1, 1, 1, 3, 3}, {3, 3, 1, 1, 2, 2}, {5, 5, 1, 1, 2, 2},
            {7, 7, 1, 1, 2, 2}, {1, 1, 2, 2, 3, 3}, {3, 3, 2, 2, 2, 2},
            {5, 5, 1, 1, 2, 2}, {7, 7, 1, 1, 2, 2}};
    for (const auto& c : cases) {
        run_with_param(c[0], c[1], c[2], c[3], c[4], c[5]);
    }
}
/*!
 * Checks opr::LocalShare against local_share_brute with gradient checking
 * enabled on the output (numdiff_max_err = 1e-1).
 * Skipped by REQUIRE_GPU(1) when no GPU is available.
 */
TEST(TestOprDNN, LocalShareForwardGrad) {
    REQUIRE_GPU(1);
    using Checker = AutoOprChecker<2, 1>;
    using Param = opr::LocalShare::Param;
    Param param;
    param.mode = Param::Mode::CROSS_CORRELATION;
    param.sparse = Param::Sparse::DENSE;

    auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
        return {opr::LocalShare::make(inputs[0], inputs[1], param)};
    };
    auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
        mgb_assert(inp.size() == 2);
        mgb_assert(dest.size() == 1);
        std::shared_ptr<HostTensorND> expected;
        local_share_brute({inp[0], inp[1]}, expected, param);
        dest[0] = *expected;
    };

    auto run_with_param = [&](size_t fh = 3, size_t fw = 3, size_t sh = 1,
                              size_t sw = 1, size_t sgh = 3, size_t sgw = 3) {
        // Padding is half the filter extent.
        const size_t ph = fh / 2, pw = fw / 2;
        param.pad_h = ph;
        param.pad_w = pw;
        param.stride_h = sh;
        param.stride_w = sw;
        param.spatial_groups_h = sgh;
        param.spatial_groups_w = sgw;

        Checker checker{make_graph, fwd};
        Checker::RunOptions opt;
        checker.set_output_allow_grad(0, true);
        opt.numdiff_max_err = 1e-1;
        checker.set_input_dtype(0, dtype::Float32());
        checker.set_input_dtype(1, dtype::Float32());

        auto run = [&](size_t n, size_t ci, size_t co, size_t hi, size_t wi) {
            const size_t ho = (hi + 2 * ph - fh) / sh + 1;
            const size_t wo = (wi + 2 * pw - fw) / sw + 1;
            // Cases whose output does not split evenly into the spatial
            // groups are skipped.
            const bool splits_evenly = (ho % sgh == 0) && (wo % sgw == 0);
            if (splits_evenly) {
                checker.run(
                        {TensorShape{n, ci, hi, wi},
                         TensorShape{sgh, sgw, ci, fh, fw, co}},
                        opt);
            }
        };
        run(4, 2, 8, 24, 24);
        run(8, 2, 4, 6, 6);
        run(16, 4, 8, 12, 12);
        run(4, 4, 8, 12, 12);
    };

    // Columns: fh, fw, sh, sw, sgh, sgw.
    const size_t cases[][6] = {
            {1, 1, 1, 1, 3, 3}, {1, 1, 2, 2, 3, 3}, {3, 3, 2, 2, 2, 2}};
    for (const auto& c : cases) {
        run_with_param(c[0], c[1], c[2], c[3], c[4], c[5]);
    }
}
/*!
 * Runs opr::LocalShare under each execution strategy, compares the output
 * with local_share_brute, and checks how often the persistent cache was read:
 * never for plain HEURISTIC, at least once for every other strategy.
 * Skipped by REQUIRE_GPU(1) when no GPU is available.
 *
 * The non-fastrun strategy list used to spell its first entry
 * `S : HEURISTIC`, which does not compile; it is now `S::HEURISTIC`.
 */
TEST(TestOprDNN, LocalShareForwardExecPolicy) {
    REQUIRE_GPU(1);
    using Checker = AutoOprChecker<2, 1>;
    using Policy = opr::LocalShare::ExecutionPolicy;
    using S = Policy::Strategy;
    using Param = opr::LocalShare::Param;
    Param param;
    param.mode = Param::Mode::CROSS_CORRELATION;
    param.sparse = Param::Sparse::DENSE;
    // Counts calls of the hook installed on the persistent cache.
    int nr_get = 0;
    auto on_get = [&nr_get](
                          const std::string&, const void*, size_t, const void*,
                          size_t) { ++nr_get; };
    PersistentCacheHook cache_hook{on_get};
    const SmallVector<S> strategies{
#if MGB_ENABLE_FASTRUN
            S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
            S::PROFILE | S::HEURISTIC, S::PROFILE | S::OPTIMIZED
#else
            S::HEURISTIC, S::PROFILE | S::HEURISTIC
#endif
    };
    for (auto strategy : strategies) {
        auto make_graph =
                [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
            Policy policy;
            policy.strategy = strategy;
            return {opr::LocalShare::make(inputs[0], inputs[1], param, policy)};
        };
        auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
            mgb_assert(inp.size() == 2);
            mgb_assert(dest.size() == 1);
            std::shared_ptr<HostTensorND> out;
            local_share_brute({inp[0], inp[1]}, out, param);
            dest[0] = *out;
        };
        auto run_with_param = [&](size_t fh = 3, size_t fw = 3, size_t sh = 1,
                                  size_t sw = 1, size_t sgh = 3, size_t sgw = 3) {
            // Start every configuration with an empty heuristic cache.
            megdnn::HeuristicCache::instance().clear();
            size_t ph = fh / 2, pw = fw / 2;
            param.pad_h = ph, param.pad_w = pw;
            param.stride_h = sh, param.stride_w = sw, param.spatial_groups_h = sgh,
            param.spatial_groups_w = sgw;
            Checker checker{make_graph, fwd};
            Checker::RunOptions opt;
            checker.set_output_allow_grad(0, false);
            checker.set_input_dtype(0, dtype::Float32());
            checker.set_input_dtype(1, dtype::Float32());
            nr_get = 0;
            opt.outputs_max_err = 1e-3;
            auto run = [&](size_t n, size_t ci, size_t co, size_t hi, size_t wi) {
                size_t ho = (hi + 2 * ph - fh) / sh + 1;
                size_t wo = (wi + 2 * pw - fw) / sw + 1;
                // Skip cases whose output does not split evenly into the
                // spatial groups.
                if (ho % sgh != 0 || wo % sgw != 0)
                    return;
                checker.run(
                        {TensorShape{n, ci, hi, wi},
                         TensorShape{sgh, sgw, ci, fh, fw, co}},
                        opt);
            };
            run(32, 4, 8, 24, 24);
            run(32, 4, 8, 12, 12);
            run(16, 4, 8, 12, 12);
            run(32, 4, 8, 6, 6);
            if (strategy == S::HEURISTIC) {
                ASSERT_EQ(0, nr_get);
            } else {
                ASSERT_LT(0, nr_get);
            }
        };
        run_with_param(1, 1, 1, 1, 3, 3);
        run_with_param(3, 3, 1, 1, 2, 2);
        run_with_param(5, 5, 1, 1, 2, 2);
        run_with_param(7, 7, 1, 1, 2, 2);
        run_with_param(1, 1, 2, 2, 3, 3);
        run_with_param(3, 3, 2, 2, 2, 2);
        run_with_param(5, 5, 1, 1, 2, 2);
        run_with_param(7, 7, 1, 1, 2, 2);
    }
}
/*!
 * Dumps a graph containing one opr::LocalShareForward to a file and loads it
 * back, checking that one output var is written and one is read.
 */
TEST(TestOprDNN, LocalShareSerialization) {
    using namespace serialization;
    auto fname = output_file("LocalShareForwardTest");

    // Build the graph and write it to fname.
    auto dump = [&]() {
        opr::LocalShare::Param param;
        param.mode = Mode::CROSS_CORRELATION;
        param.stride_h = param.stride_w = 1;
        param.pad_h = param.pad_w = 0;
        param.spatial_groups_h = param.spatial_groups_w = 3;

        auto cn = CompNode::load("cpu0");
        auto graph = ComputingGraph::make();
        HostTensorND src_host{cn, {32, 4, 24, 24}, dtype::Float32()};
        HostTensorND flt_host{cn, {3, 3, 4, 1, 1, 8}, dtype::Float32()};
        auto src = opr::ImmutableTensor::make(*graph, src_host);
        auto flt = opr::ImmutableTensor::make(*graph, flt_host);
        auto out_var = opr::LocalShareForward::make(src, flt, param, {});

        auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
        auto dumped = dumper->dump({out_var});
        ASSERT_EQ(dumped.outputs.size(), 1u);
    };

    // Read the graph back from fname.
    auto load = [&]() {
        auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
        auto loaded = loader->load();
        ASSERT_EQ(loaded.output_var_list.size(), 1u);
    };

    dump();
    load();
}
/*!
 * Runs opr::DeformableConvForward under each execution strategy and compares
 * it with the naive megdnn DeformableConvForward operator.
 * Skipped by REQUIRE_GPU(1) when no GPU is available.
 *
 * Changes from the previous version:
 *  - the non-fastrun strategy list spelled its first entry `S : HEURISTIC`,
 *    which does not compile; it is now `S::HEURISTIC`;
 *  - the offset generator calls std::abs / std::floor explicitly, so the
 *    floating-point overloads from <cmath> are used rather than whichever
 *    unqualified `abs` happens to be visible.
 */
TEST(TestOprDNN, DeformableConvForward) {
    REQUIRE_GPU(1);
    using Checker = AutoOprChecker<4, 1>;
    using Policy = opr::DeformableConvForward::ExecutionPolicy;
    using S = Policy::Strategy;
    using Param = opr::DeformableConvForward::Param;
    Param param;
    const SmallVector<S> strategies{
#if MGB_ENABLE_FASTRUN
            S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
            S::PROFILE | S::HEURISTIC, S::PROFILE | S::OPTIMIZED
#else
            S::HEURISTIC, S::PROFILE | S::HEURISTIC
#endif
    };
    for (auto strategy : strategies) {
        auto make_graph =
                [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
            Policy policy;
            policy.strategy = strategy;
            return {opr::DeformableConvForward::make(
                    inputs[0], inputs[1], inputs[2], inputs[3], param, policy)};
        };
        // Reference: execute the naive megdnn operator on the host tensors.
        auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
            auto opr = megdnn_naive_handle()
                               ->create_operator<megdnn::DeformableConvForward>();
            opr->param() = param;
            TensorLayout dest_layout;
            opr->deduce_layout(
                    inp[0]->layout(), inp[1]->layout(), inp[2]->layout(),
                    inp[3]->layout(), dest_layout);
            std::vector<dt_byte> workspace(opr->get_workspace_in_bytes(
                    inp[0]->layout(), inp[1]->layout(), inp[2]->layout(),
                    inp[3]->layout(), dest_layout));
            dest[0].dtype(dtype::Float32())
                    .comp_node(inp[0]->comp_node())
                    .resize(dest_layout);
            opr->exec(
                    inp[0]->as_megdnn(), inp[1]->as_megdnn(), inp[2]->as_megdnn(),
                    inp[3]->as_megdnn(), dest[0].as_megdnn(),
                    {workspace.data(), workspace.size()});
        };
        auto run_with_param = [&](size_t fh, size_t fw, size_t sh, size_t sw, size_t dh,
                                  size_t dw, size_t group, size_t deformable_group) {
            Checker checker{make_graph, fwd};
            size_t ph = fh / 2, pw = fw / 2;
            param.pad_h = ph, param.pad_w = pw;
            param.stride_h = sh, param.stride_w = sw;
            param.dilate_h = dh, param.dilate_w = dw;
            param.format = Param::Format::NCHW;
            param.mode = Param::Mode::CROSS_CORRELATION;
            param.sparse = Param::Sparse::DENSE;
            if (group > 1)
                param.sparse = Param::Sparse::GROUP;
            Checker::RunOptions opt;
            float DELTA = 1e-3;
            opt.numdiff_eps = DELTA;
            opt.numdiff_max_err = 1e-1;
            // Fills the offset input with values in [l, h) that stay on the
            // same side of every integer when moved by +/- 2 * DELTA, i.e.
            // floor() is unchanged by a perturbation of that size.
            auto gen_off = [DELTA](HostTensorND& off, float l = -2.f, float h = 2.f) {
                RNGxorshf rng{next_rand_seed()};
                auto elems = off.shape().total_nr_elems();
                auto ptr = off.ptr<float>();
                auto rand_real = [](RNGxorshf& rng, float lo, float hi) {
                    std::uniform_real_distribution<float> dist(lo, hi);
                    return dist(rng);
                };
                for (size_t i = 0; i < elems; ++i) {
                    // Redraw until a value away from integer boundaries comes up.
                    for (;;) {
                        float val = rand_real(rng, l, h);
                        if (std::abs(std::floor(val + 2 * DELTA) - std::floor(val)) <=
                                    1e-6f &&
                            std::abs(std::floor(val - 2 * DELTA) - std::floor(val)) <=
                                    1e-6f) {
                            ptr[i] = val;
                            break;
                        }
                    }
                }
            };
            checker.set_input_generator(2, gen_off);
            checker.set_input_dtype(0, dtype::Float32());
            checker.set_input_dtype(1, dtype::Float32());
            checker.set_input_dtype(2, dtype::Float32());
            checker.set_input_dtype(3, dtype::Float32());
            auto run = [&](size_t n, size_t ih, size_t iw, size_t icpg, size_t ocpg) {
                size_t oh = (ih + 2 * ph - fh) / sh + 1;
                size_t ow = (iw + 2 * pw - fw) / sw + 1;
                // Inputs: source, filter (5-dim when grouped), offset, mask.
                checker.run(
                        {TensorShape{n, group * icpg, ih, iw},
                         (param.sparse == Param::Sparse::GROUP)
                                 ? TensorShape{group, ocpg, icpg, fh, fw}
                                 : TensorShape{group * ocpg, group * icpg, fh, fw},
                         {n, 2 * deformable_group * fh * fw, oh, ow},
                         {n, deformable_group * fh * fw, oh, ow}},
                        opt);
            };
            run(1, 3, 3, 2, 1);
            run(2, 3, 3, 2, 2);
            run(1, 5, 5, 2, 1);
        };
        run_with_param(3, 3, 1, 1, 1, 1, 2, 2);
    }
}
/*!
 * Dumps a graph containing one opr::DeformableConvForward to a file and loads
 * it back, checking that one output var is written and one is read.
 */
TEST(TestOprDNN, DeformableConvSerialization) {
    using namespace serialization;
    auto fname = output_file("DeformableConvTest");

    // Build the graph and write it to fname.
    auto dump = [&]() {
        using Param = opr::DeformableConvForward::Param;

        // Problem size.
        const size_t n = 16, ocpg = 2, icpg = 4;
        const size_t ih = 24, iw = 24;
        const size_t fh = 3, fw = 3;
        const size_t ph = 2, pw = 2;
        const size_t sh = 1, sw = 1;
        const size_t dh = 1, dw = 1;
        const size_t group = 1, deformable_group = 1;
        const size_t oh = (ih + 2 * ph - fh) / sh + 1;
        const size_t ow = (iw + 2 * pw - fw) / sw + 1;

        Param param;
        param.pad_h = ph;
        param.pad_w = pw;
        param.stride_h = sh;
        param.stride_w = sw;
        param.dilate_h = dh;
        param.dilate_w = dw;
        param.format = Param::Format::NCHW;
        param.mode = Param::Mode::CROSS_CORRELATION;
        param.sparse = Param::Sparse::DENSE;

        auto cn = CompNode::load("cpu0");
        auto graph = ComputingGraph::make();
        HostTensorND src_host{cn, {n, group * icpg, ih, iw}, dtype::Float32()};
        HostTensorND flt_host{
                cn, {group * ocpg, group * icpg, fh, fw}, dtype::Float32()};
        HostTensorND off_host{
                cn, {n, 2 * deformable_group * fh * fw, oh, ow}, dtype::Float32()};
        HostTensorND msk_host{
                cn, {n, deformable_group * fh * fw, oh, ow}, dtype::Float32()};
        auto src = opr::ImmutableTensor::make(*graph, src_host);
        auto flt = opr::ImmutableTensor::make(*graph, flt_host);
        auto off = opr::ImmutableTensor::make(*graph, off_host);
        auto msk = opr::ImmutableTensor::make(*graph, msk_host);
        auto out_var =
                opr::DeformableConvForward::make(src, flt, off, msk, param, {}, {});

        auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
        auto dumped = dumper->dump({out_var});
        ASSERT_EQ(dumped.outputs.size(), 1u);
    };

    // Read the graph back from fname.
    auto load = [&]() {
        auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
        auto loaded = loader->load();
        ASSERT_EQ(loaded.output_var_list.size(), 1u);
    };

    dump();
    load();
}
#if MGB_CUDA
/*!
 * Runs quantized opr::BatchConvBiasForward (NCHW4) on gpu0 under each
 * execution strategy. The reference folds the batch into the channel axis,
 * runs a GROUP-sparse opr::Convolution, reshapes back and adds the bias.
 * Skipped unless a GPU with compute capability 6.1 is available.
 *
 * The non-fastrun strategy list used to spell its first entry
 * `S : HEURISTIC`, which does not compile; it is now `S::HEURISTIC`.
 */
TEST(TestOprDNN, BatchConvBiasForward) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1);
    using Checker = AutoOprChecker<3, 1>;
    using Policy = opr::BatchConvBiasForward::ExecutionPolicy;
    using S = Policy::Strategy;
    using Param = opr::BatchConvBiasForward::Param;
    Param param;
    param.format = Param::Format::NCHW4;
    param.mode = Param::Mode::CROSS_CORRELATION;
    param.sparse = Param::Sparse::DENSE;
    const SmallVector<S> strategies{
#if MGB_ENABLE_FASTRUN
            S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
            S::PROFILE | S::HEURISTIC, S::PROFILE | S::OPTIMIZED
#else
            S::HEURISTIC, S::PROFILE | S::HEURISTIC
#endif
    };
    for (auto strategy : strategies) {
        auto make_quantized = [&](SymbolVar x, const DType& dtype) {
            return opr::TypeCvt::make(x, dtype);
        };
        // Operator under test: quantize the float inputs, run
        // BatchConvBiasForward with a QuantizedS8 output, convert to float.
        auto make_graph =
                [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
            Policy policy;
            policy.strategy = strategy;
            auto conv_bias = opr::BatchConvBiasForward::make(
                    make_quantized(inputs[0], dtype::QuantizedS8{1.1f}),
                    make_quantized(inputs[1], dtype::QuantizedS8{1.2f}),
                    make_quantized(inputs[2], dtype::QuantizedS32{1.1f * 1.2f}), param,
                    policy, OperatorNodeConfig{dtype::QuantizedS8{1.3f}});
            return {opr::TypeCvt::make(conv_bias, dtype::Float32())};
        };
        auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
            mgb_assert(inp.size() == 3);
            mgb_assert(dest.size() == 1);
            auto graph = ComputingGraph::make();
            Checker::SymInpArray inputs;
            for (size_t i = 0; i < inp.size(); ++i) {
                inputs[i] = opr::Host2DeviceCopy::make(*graph, inp[i]);
            }
            auto src = make_quantized(inputs[0], dtype::QuantizedS8{1.1f}),
                 filter = make_quantized(inputs[1], dtype::QuantizedS8{1.2f}),
                 bias = make_quantized(inputs[2], dtype::QuantizedS32{1.1f * 1.2f});
            {
                // Reshape src to (1, dim0 * dim1, dim2, dim3, dim4).
                auto xshp = opr::GetVarShape::make(src);
                auto cv = [&src](int v) { return src.make_scalar(v); };
                auto sub = [&xshp, &cv](int idx) {
                    return opr::IndexAt::make(xshp, {{0, cv(idx)}});
                };
                auto tshp = opr::Concat::make(
                        {cv(1), sub(0) * sub(1), sub(2), sub(3), sub(4)}, 0);
                src = opr::Reshape::make(src, tshp);
            }
            auto conv_param = convert_to_conv_param(param);
            conv_param.sparse = opr::BatchConvBias::Param::Sparse::GROUP;
            auto y = opr::Convolution::make(src, filter, conv_param);
            {
                // Reshape the result to (batch, dim1 / batch, dim2, dim3,
                // dim4), where batch is the leading filter dimension.
                auto fshp = opr::GetVarShape::make(filter);
                auto batch = opr::IndexAt::make(fshp, {{0, filter.make_scalar(0)}});
                auto xshp = opr::GetVarShape::make(y);
                auto cv = [&y](int v) { return y.make_scalar(v); };
                auto sub = [&xshp, &cv](int idx) {
                    return opr::IndexAt::make(xshp, {{0, cv(idx)}});
                };
                auto tshp = opr::Concat::make(
                        {batch, sub(1) / batch, sub(2), sub(3), sub(4)}, 0);
                y = opr::Reshape::make(y, tshp);
            }
            y = y + bias;
            y = opr::TypeCvt::make(y, dtype::QuantizedS8{1.3f});
            y = opr::TypeCvt::make(y, dtype::Float32());
            auto func = graph->compile({make_callback_copy(y, dest[0])});
            func->execute();
            func->wait();
        };
        auto run_with_param = [&](size_t sh = 1, size_t sw = 1) {
            size_t fh = 1;
            size_t fw = 1;
            size_t ph = fh / 2, pw = fw / 2;
            param.pad_h = ph, param.pad_w = pw;
            param.stride_h = sh, param.stride_w = sw;
            Checker checker{make_graph, fwd, cn};
            Checker::RunOptions opt;
            checker.set_output_allow_grad(0, false);
            checker.set_input_dtype(0, dtype::Float32());
            checker.set_input_dtype(1, dtype::Float32());
            checker.set_input_dtype(2, dtype::Float32());
            auto run = [&](size_t n, size_t ci, size_t co, size_t hi, size_t wi) {
                // The filter carries a leading dimension of size n.
                checker.run(
                        {TensorShape{n, ci / 4, hi, wi, 4},
                         TensorShape{n, co, ci / 4, fh, fw, 4},
                         TensorShape{1, co / 4, 1, 1, 4}},
                        opt);
            };
            run(32, 16, 32, 24, 24);
            run(16, 16, 32, 24, 24);
            run(32, 16, 64, 12, 12);
            run(16, 16, 64, 6, 6);
        };
        run_with_param(1, 1);
        run_with_param(2, 2);
    }
}
#endif
//! Serializes a graph holding a BatchConvBiasForward operator to a file and
//! verifies that loading the file yields exactly one output var.
TEST(TestOprDNN, BatchConvBiasSerialization) {
    using namespace serialization;
    using BatchParam = opr::BatchConvBias::Param;
    auto fname = output_file("BatchConvBiasForwardTest");

    // Build the operator on cpu0 and write it out.
    auto dump = [&]() {
        BatchParam param;
        param.mode = Mode::CROSS_CORRELATION;
        param.format = BatchParam::Format::NCHW4;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = 0;
        param.pad_w = 0;

        auto cn = CompNode::load("cpu0");
        auto graph = ComputingGraph::make();

        HostTensorND src_host{cn, {32, 1, 24, 24, 4}, dtype::QuantizedS8{1.1f}};
        HostTensorND flt_host{cn, {32, 8, 1, 1, 1, 4}, dtype::QuantizedS8{1.2f}};
        auto src = opr::ImmutableTensor::make(*graph, src_host);
        auto flt = opr::ImmutableTensor::make(*graph, flt_host);

        auto batch_conv = opr::BatchConvBiasForward::make(
                src, flt, param, {}, OperatorNodeConfig{dtype::QuantizedS8{1.3f}});

        auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
        auto dump_result = dumper->dump({batch_conv});
        ASSERT_EQ(dump_result.outputs.size(), 1u);
    };

    // Read the file back and check the number of outputs.
    auto load = [&]() {
        auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
        auto load_result = loader->load();
        ASSERT_EQ(load_result.output_var_list.size(), 1u);
    };

    dump();
    load();
}
/*!
 * For each heuristic strategy, runs ConvolutionBackwardFilter twice (clearing
 * the megdnn heuristic cache in between) and expects the same algorithm name
 * to be chosen both times. With the REPRODUCIBLE flag the chosen algorithm
 * must also carry the REPRODUCIBLE attribute.
 */
TEST(TestOprDNN, HeuristicReproducible) {
    using Policy = opr::ConvolutionBackwardFilter::ExecutionPolicy;
    using S = Policy::Strategy;
    using Checker = AutoOprChecker<3, 1>;
    constexpr size_t PH = 1, PW = 1, SH = 1, SW = 1;
    for (auto strategy : SmallVector<S>{S::HEURISTIC, S::HEURISTIC | S::REPRODUCIBLE}) {
        // Set by make_graph; points at the output var of the most recently
        // built ConvolutionBackwardFilter.
        VarNode* bwd_flt;
        auto make_graph =
                [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
            Policy policy;
            policy.strategy = strategy;
            Param param{Mode::CROSS_CORRELATION, PH, PW, SH, SW};
            auto out = opr::ConvolutionBackwardFilter::make(
                    inputs[0], inputs[1], inputs[2], param, policy);
            bwd_flt = out.node();
            return {out};
        };
        // Brute-force reference.
        auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
            std::shared_ptr<HostTensorND> expected;
            conv_bwd_flt_brute(
                    {inp[0], inp[1], inp[2]}, expected,
                    Param{Mode::CROSS_CORRELATION, PH, PW, SH, SW});
            dest[0] = *expected;
        };
#define get_shp(N, P, S, F) ((N + 2 * P - F) / S + 1)
#define inp_tensor(N, IC, OC, IH, IW, FH, FW)                                  \
    {                                                                          \
        TensorShape{N, IC, IH, IW},                                            \
                {N, OC, get_shp(IH, PH, SH, FH), get_shp(IW, PW, SW, FW)}, {   \
            OC, IC, FH, FW                                                     \
        }                                                                      \
    }
        Checker::RunOptions opt;
        opt.numdiff_eps = 1;
        opt.outputs_max_err = 1e-3;

        // Runs the three shape cases on a fresh checker and returns the name
        // of the algorithm found in the megdnn operator's execution policy.
        // The name is copied out while the checker is still alive.
        auto run_and_query_algo = [&](bool verify_attribute) -> std::string {
            Checker checker(make_graph, fwd);
            checker.run(inp_tensor(2, 3, 4, 9, 8, 3, 3), opt)
                    .run(inp_tensor(1, 5, 3, 7, 9, 3, 3), opt)
                    .run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt);
            auto* graph_opr =
                    static_cast<opr::ConvolutionBackwardFilter*>(bwd_flt->owner_opr());
            auto* dnn_opr = static_cast<megdnn::ConvolutionBackwardFilter*>(
                    graph_opr->megdnn_opr());
            auto&& desc = dnn_opr->execution_policy().algo;
            megdnn::Algorithm* chosen = dnn_opr->get_algorithm_from_desc(desc);
            mgb_assert(chosen, "Unknown algo description");
            if (verify_attribute && strategy == S(S::HEURISTIC | S::REPRODUCIBLE)) {
                EXPECT_TRUE(chosen->contain_attribute_all(
                        megdnn::AlgoAttribute::REPRODUCIBLE));
            }
            return chosen->name();
        };

        std::string algo_name0 = run_and_query_algo(true);
        megdnn::HeuristicCache::instance().clear();
        std::string algo_name1 = run_and_query_algo(false);
        EXPECT_TRUE(algo_name0 == algo_name1);
        megdnn::HeuristicCache::instance().clear();
    }
#undef inp_tensor
#undef get_shp
}
#if MGB_CUDA
/*!
 * Builds the same two-layer quantized ConvBias graph on two comp nodes
 * ("gpu0:0" and "gpu0:1") and executes both compiled functions concurrently
 * from two threads.
 */
TEST(TestOprDNN, ConvolutionMultiCompNode) {
    REQUIRE_GPU(1);
    auto cn0 = CompNode::load("gpu0:0"), cn1 = CompNode::load("gpu0:1");
    cn0.activate();

    // Skip when the device's compute capability is below 6.1.
    auto&& prop = CompNodeEnv::from_comp_node(cn0).cuda_env().device_prop;
    constexpr int kRequiredSm = 61;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < kRequiredSm) {
        printf("This testcast ignored due to insufficient cuda cap(got: %d, "
               "expected: %d)\n",
               sm_ver, kRequiredSm);
        return;
    }

    HostTensorGenerator<dtype::Int8> gen;
    // Graph input: host tensor copied to device, then converted to `dtype`.
    auto mkvar = [&gen](const char* name, const TensorShape& shp, const DType& dtype,
                        std::shared_ptr<ComputingGraph> graph, const CompNode& cn) {
        auto copied = opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
        return opr::TypeCvt::make(copied, dtype);
    };
    // Constant: shared device tensor converted to `dtype`.
    auto mkcvar = [&gen](const char* name, const TensorShape& shp, const DType& dtype,
                         std::shared_ptr<ComputingGraph> graph, const CompNode& cn) {
        auto shared = opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
        return opr::TypeCvt::make(shared, dtype);
    };

    auto graph0 = ComputingGraph::make();
    auto graph1 = ComputingGraph::make();
    graph0->options().graph_opt_level = 0;
    graph1->options().graph_opt_level = 0;

    // Two stacked NCHW4 ConvBias layers with RELU, both using PROFILE.
    auto make_func = [&mkvar, &mkcvar](
                             std::shared_ptr<ComputingGraph> graph,
                             const CompNode& cn) {
        using Policy = opr::ConvBias::ExecutionPolicy;
        using S = Policy::Strategy;
        const auto qs8 = dtype::QuantizedS8(2.5f);
        const auto qs32 = dtype::QuantizedS32(6.25f);
        auto x = mkvar("x", {64, 32, 28, 28, 4}, qs8, graph, cn);
        auto w1 = mkcvar("w1", {256, 32, 5, 5, 4}, qs8, graph, cn);
        auto b1 = mkcvar("b1", {1, 64, 1, 1, 4}, qs32, graph, cn);
        auto w2 = mkcvar("w2", {256, 64, 3, 3, 4}, qs8, graph, cn);
        auto b2 = mkcvar("b2", {1, 64, 1, 1, 4}, qs32, graph, cn);

        Policy policy;
        policy.strategy = S::PROFILE;

        opr::ConvBias::Param param;
        param.format = opr::ConvBias::Param::Format::NCHW4;
        param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;

        // First layer: stride 2, padding 2.
        param.stride_h = 2;
        param.stride_w = 2;
        param.pad_h = 2;
        param.pad_w = 2;
        auto y = opr::ConvBias::make(
                x, w1, b1, param, policy, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});

        // Second layer: stride 1, padding 1.
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = 1;
        param.pad_w = 1;
        y = opr::ConvBias::make(
                y, w2, b2, param, policy, OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
        return y;
    };

    auto y0 = make_func(graph0, cn0);
    auto y1 = make_func(graph1, cn1);
    HostTensorND host_y0, host_y1;
    auto func0 = graph0->compile({make_callback_copy(y0, host_y0)});
    auto func1 = graph1->compile({make_callback_copy(y1, host_y1)});

    // Worker 0 drives func0, any other worker id drives func1.
    auto worker = [&func0, &func1](int wid) {
        constexpr int kIterNum = 1000;
        auto& func = (wid == 0) ? func0 : func1;
        for (int iter = 0; iter < kIterNum; ++iter) {
            func->execute();
        }
    };
    std::thread worker0(worker, 0);
    std::thread worker1(worker, 1);
    worker0.join();
    worker1.join();
}
#endif
}
#ifndef _WIN32
namespace mgb {
namespace opr {
namespace testing {
class ConvolutionTestingPeer {
opr::ConvolutionForward& m_conv_opr;
public:
explicit ConvolutionTestingPeer(cg::OperatorNodeBase* opr)
: m_conv_opr(opr->cast_final_safe<opr::ConvolutionForward>()) {}
void set_megdnn_opr(std::unique_ptr<megdnn::ConvolutionForward> megdnn_opr) {
m_conv_opr.set_megdnn_opr(std::move(megdnn_opr));
}
};
} } }
namespace {
// Short names for the megdnn types and the testing peer used by the
// weight-preprocess tests below.
using megdnn::TensorND;
using megdnn::Workspace;
using opr::testing::ConvolutionTestingPeer;
//! gmock-based mock of megdnn::ConvolutionForward.
//!
//! The mock is constructed on the handle of an existing operator and reports
//! the algorithm set name supplied at construction time. All other methods
//! listed below are mocked, so tests define their behaviour with EXPECT_CALL.
class MockConvolutionForward : public megdnn::ConvolutionForward {
// Value returned by get_algorithm_set_name(); the pointer is stored as-is.
const char* m_algorithm_set_name;
public:
//! \param orig operator whose handle is reused for the mock
//! \param algo_set_name name returned by get_algorithm_set_name()
MockConvolutionForward(megdnn::ConvolutionForward* orig, const char* algo_set_name)
: megdnn::ConvolutionForward(orig->handle()),
m_algorithm_set_name(algo_set_name) {}
// Execution entry points.
MOCK_METHOD5(
exec,
void(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_out dst, const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace));
MOCK_METHOD5(
exec_preprocess,
void(const TensorLayout& src_layout, _megdnn_tensor_in filter,
const TensorLayout& dst_layout,
PreprocessedFilter* preprocessed_filter, _megdnn_workspace workspace));
// Workspace and preprocessed-filter layout queries.
MOCK_METHOD4(
get_workspace_in_bytes,
size_t(const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter));
MOCK_METHOD3(
deduce_preprocessed_filter_layout,
SmallVector<TensorLayout>(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst));
MOCK_METHOD3(
get_preprocess_workspace_in_bytes,
size_t(const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst));
// Algorithm enumeration and selection queries.
MOCK_METHOD3(
get_all_algorithms_info,
std::vector<AlgorithmInfo>(
const TensorLayout& p0, const TensorLayout& p1,
const TensorLayout& p2));
MOCK_METHOD3(
get_all_algorithms_info_safe,
std::vector<AlgorithmInfo>(
const TensorLayout& p0, const TensorLayout& p1,
const TensorLayout& p2));
MOCK_METHOD6(
get_algorithm_info_heuristic,
AlgorithmInfo(
const TensorLayout& p0, const TensorLayout& p1,
const TensorLayout& p2, size_t workspace_limit_in_bytes,
const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr));
MOCK_METHOD3(
get_all_algorithms, std::vector<Algorithm*>(
const TensorLayout& p0, const TensorLayout& p1,
const TensorLayout& p2));
MOCK_METHOD3(
get_all_algorithms_safe,
std::vector<Algorithm*>(
const TensorLayout& p0, const TensorLayout& p1,
const TensorLayout& p2));
MOCK_METHOD6(
get_algorithm_heuristic,
Algorithm*(
const TensorLayout& p0, const TensorLayout& p1,
const TensorLayout& p2, size_t workspace_limit_in_bytes,
const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr));
MOCK_METHOD1(get_algorithm_from_desc, Algorithm*(const AlgorithmDesc&));
protected:
// Not mocked: simply returns the name given to the constructor.
const char* get_algorithm_set_name() const override { return m_algorithm_set_name; }
};
class MockAlgorithm : public megdnn::detail::Algorithm {
const char* m_name;
public:
MockAlgorithm(const char* name = "NotImportant") : m_name(name) {}
Attribute attribute() const override { return Attribute::REPRODUCIBLE; }
const char* name() const override { return m_name; }
uint32_t type() const override {
return megdnn::detail::Algorithm::INVALID_ALGO_TYPE;
}
virtual ~MockAlgorithm() = default;
};
class TestWeightPreprocess : public ::testing::Test {
protected:
CompNode comp_node;
std::shared_ptr<ComputingGraph> graph;
std::shared_ptr<HostTensorND> x_host;
MockConvolutionForward* mock_conv_ptr;
SymbolVar y;
HostTensorND y_host;
std::unique_ptr<cg::AsyncExecutable> func;
MockConvolutionForward& mock_conv() { return *mock_conv_ptr; }
void SetUp() override {
constexpr uint32_t ih = 10, ic = 16, oc = 32, ph = 0, sh = 1, fh = 2, iw = ih;
comp_node = CompNode::load("cpux");
graph = ComputingGraph::make();
graph->options().graph_opt.weight_preprocess = is_weight_preprocess();
TensorShape x_shape{1, ic, ih, iw}, w_shape{oc, ic, fh, fh};
x_host = std::make_shared<HostTensorND>(comp_node, x_shape);
auto x = opr::Host2DeviceCopy::make(*graph, x_host);
auto w = opr::ImmutableTensor::make(*graph, {comp_node, w_shape});
Param param;
param.pad_h = param.pad_w = ph;
param.stride_h = param.stride_w = sh;
param.format = Param::Format::NCHW;
y = opr::ConvolutionForward::make(x, w, param);
auto& opr = y.node()->owner_opr()->cast_final<opr::ConvolutionForward>();
auto mock = std::make_unique<MockConvolutionForward>(
opr.megdnn_opr(),
::testing::UnitTest::GetInstance()->current_test_info()->name());
mock_conv_ptr = mock.get();
ConvolutionTestingPeer{&opr}.set_megdnn_opr(std::move(mock));
func = graph->compile({make_callback_copy(y, y_host)});
}
void run() { func->execute().wait(); }
virtual bool is_weight_preprocess() { return true; }
void TearDown() override {
func.reset();
graph.reset();
x_host.reset();
}
};
//! When the mock reports an empty preprocessed-filter layout, exec_preprocess
//! must not be called and exec must receive a null preprocessed filter.
TEST_F(TestWeightPreprocess, NoPreprocessNeeded) {
    using ::testing::_;
    using ::testing::Return;

    MockAlgorithm dummy_algo;
    auto& conv = mock_conv();

    // Unordered stubs for algorithm and workspace queries.
    EXPECT_CALL(conv, get_algorithm_from_desc(_)).WillRepeatedly(Return(&dummy_algo));
    EXPECT_CALL(conv, get_algorithm_heuristic(_, _, _, _, _, _))
            .WillRepeatedly(Return(&dummy_algo));
    EXPECT_CALL(conv, get_preprocess_workspace_in_bytes(_, _, _))
            .WillRepeatedly(Return(0));
    EXPECT_CALL(conv, get_workspace_in_bytes(_, _, _, _)).WillRepeatedly(Return(0));

    // The expectations below are registered under a single InSequence scope.
    ::testing::InSequence ordered;
    EXPECT_CALL(conv, deduce_preprocessed_filter_layout(_, _, _))
            .WillRepeatedly(Return(SmallVector<TensorLayout>{}));
    EXPECT_CALL(conv, exec_preprocess(_, _, _, _, _)).Times(0);
    EXPECT_CALL(conv, exec(_, _, _, nullptr, _));
    run();
}
// Runs the compiled function three times and expects exec_preprocess to be
// invoked exactly once; every exec call must then receive the preprocessed
// filter that exec_preprocess filled in.
TEST_F(TestWeightPreprocess, PreprocessCalledOnlyOnce) {
megdnn::HeuristicCache::instance().clear();
using ::testing::_;
using ::testing::Expectation;
using ::testing::Field;
using ::testing::Invoke;
using ::testing::Return;
using PF = MockConvolutionForward::PreprocessedFilter;
auto& mock = mock_conv();
MockAlgorithm algo;
// Layouts reported for the preprocessed filter; checked again inside the
// exec_preprocess action below.
SmallVector<TensorLayout> filter_layout{
{{1, 2, 3, 4}, dtype::Float32()}, {{5, 6, 7, 8}, dtype::Float32()}};
EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _))
.WillRepeatedly(Return(filter_layout));
EXPECT_CALL(mock, get_algorithm_from_desc(_)).WillRepeatedly(Return(&algo));
// The heuristic query is expected once; both workspace queries are expected
// once each and only after it.
Expectation algo_call = EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _, _))
.WillOnce(Return(&algo));
Expectation ws_call = EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _))
.After(algo_call)
.WillOnce(Return(0));
Expectation pre_ws_call =
EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _))
.After(algo_call)
.WillOnce(Return(233));
{
::testing::InSequence seq;
// Its address serves as the marker stored in algorithm_id; the lambdas below
// capture it by reference and are only invoked from run() inside this scope.
int salt = 0;
EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _))
.After(ws_call, pre_ws_call)
.WillOnce(Invoke([&](const TensorLayout&, _megdnn_tensor_in,
const TensorLayout&, PF* pf,
_megdnn_workspace workspace) {
// 233 is the size returned by get_preprocess_workspace_in_bytes above.
ASSERT_EQ(workspace.size, 233);
ASSERT_NE(pf, nullptr);
pf->algorithm_id = &salt;
ASSERT_EQ(pf->tensors.size(), 2);
ASSERT_TRUE(pf->tensors[0].layout.eq_shape({1, 2, 3, 4}));
ASSERT_TRUE(pf->tensors[1].layout.eq_shape({5, 6, 7, 8}));
ASSERT_NE(pf->tensors[0].raw_ptr(), nullptr);
ASSERT_NE(pf->tensors[1].raw_ptr(), nullptr);
// Write sentinel values that the exec action reads back.
pf->tensors[0].ptr<float>()[0] = 114.514f;
pf->tensors[1].ptr<float>()[0] = 1926.0817f;
}));
for (int i = 0; i < 3; i++) {
if (i > 0) {
// From the second run on, preprocessing must not happen again.
EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _)).Times(0);
}
EXPECT_CALL(mock, exec(_, _, _, _, _))
.WillOnce(Invoke([&](_megdnn_tensor_in, _megdnn_tensor_in,
_megdnn_tensor_out, const PF* pf,
_megdnn_workspace) {
// Every exec must see the marker and the sentinel values set above.
ASSERT_NE(pf, nullptr);
ASSERT_EQ(pf->algorithm_id, &salt);
ASSERT_EQ(pf->tensors[0].ptr<float>()[0], 114.514f);
ASSERT_EQ(pf->tensors[1].ptr<float>()[0], 1926.0817f);
}));
run();
}
}
}
//! Same fixture as TestWeightPreprocess, except that is_weight_preprocess()
//! returns false, so SetUp() writes false to the weight_preprocess option.
class TestNoWeightPreprocess : public TestWeightPreprocess {
private:
    bool is_weight_preprocess() override { return false; }
};
//! With weight preprocessing disabled, neither the preprocessed-filter layout
//! query nor exec_preprocess may be called, and exec must receive a null
//! preprocessed filter.
TEST_F(TestNoWeightPreprocess, NoPreprocess) {
    using ::testing::_;
    using ::testing::Return;

    MockAlgorithm dummy_algo;
    auto& conv = mock_conv();

    // Unordered stubs for algorithm and workspace queries.
    EXPECT_CALL(conv, get_algorithm_from_desc(_)).WillRepeatedly(Return(&dummy_algo));
    EXPECT_CALL(conv, get_algorithm_heuristic(_, _, _, _, _, _))
            .WillRepeatedly(Return(&dummy_algo));
    EXPECT_CALL(conv, get_preprocess_workspace_in_bytes(_, _, _))
            .WillRepeatedly(Return(0));
    EXPECT_CALL(conv, get_workspace_in_bytes(_, _, _, _)).WillRepeatedly(Return(0));

    // The expectations below are registered under a single InSequence scope.
    ::testing::InSequence ordered;
    EXPECT_CALL(conv, deduce_preprocessed_filter_layout(_, _, _)).Times(0);
    EXPECT_CALL(conv, exec_preprocess(_, _, _, _, _)).Times(0);
    EXPECT_CALL(conv, exec(_, _, _, nullptr, _));
    run();
}
}
#endif
namespace {
/*!
 * Builds two stacked ConvBiasForward operators with QuantizedS4 input, filter
 * and output dtypes, executes the graph, dumps it to a file, loads it back,
 * executes the loaded graph on the same input and compares both results.
 */
TEST(TestOprDNN, ConvBiasInt4Serialize) {
    using namespace serialization;
    float inp_scale = 1.20210327f;
    float filt_scale = 1.20210406f;
    float bias_scale = inp_scale * filt_scale;
    DType output_dtype = dtype::QuantizedS4{inp_scale};
    HostTensorGenerator<dtype::Int8> gen;
    // Input value generated in dump() and copied into the loaded graph's
    // tensors in load().
    std::shared_ptr<HostTensorND> xv;
    // Graph input: host tensor copied to device, then converted to `dtype`.
    auto mkvar = [](const char* name, const DType& dtype,
                    std::shared_ptr<ComputingGraph> graph,
                    std::shared_ptr<HostTensorND> val) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, val).rename(name), dtype);
    };
    // Constant: shared device tensor converted to `dtype`.
    auto mkcvar = [&gen](const char* name, const TensorShape& shp, const DType& dtype,
                         std::shared_ptr<ComputingGraph> graph, const CompNode& cn) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    auto fname = output_file("ConvBiasInt4Serialize");
    HostTensorND y1, y2;
    // Build the graph, run it (result in y1) and dump it to fname.
    auto dump = [&]() {
        opr::ConvBias::Param param;
        param.mode = Mode::CONVOLUTION;
        auto cn = CompNode::load("cpu0");
        auto graph = ComputingGraph::make();
        xv = gen({1, 64, 56, 56}, cn);
        auto x = mkvar("x", dtype::QuantizedS4{inp_scale}, graph, xv);
        auto w =
                mkcvar("w", {256, 64, 1, 1}, dtype::QuantizedS4{filt_scale}, graph, cn);
        auto b =
                mkcvar("b", {1, 256, 1, 1}, dtype::QuantizedS32{bias_scale}, graph, cn);
        auto y = opr::ConvBiasForward::make(
                x, w, b, param, {}, OperatorNodeConfig{output_dtype});
        auto w1 = mkcvar(
                "w1", {64, 256, 1, 1}, dtype::QuantizedS4{filt_scale}, graph, cn);
        auto b1 =
                mkcvar("b1", {1, 64, 1, 1}, dtype::QuantizedS32{bias_scale}, graph, cn);
        y = opr::ConvBiasForward::make(
                y, w1, b1, param, {}, OperatorNodeConfig{output_dtype});
        y = opr::TypeCvt::make(y, dtype::Float32());
        auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
        auto func = graph->compile({make_callback_copy(y, y1)});
        func->execute();
        func->wait();
        auto rst = dumper->dump({y});
        ASSERT_EQ(rst.outputs.size(), 1u);
    };
    // Load the dumped graph, feed it xv and run it (result in y2).
    auto load = [&]() {
        auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
        auto rst = loader->load();
        // Validate the output list before indexing into it.
        ASSERT_EQ(rst.output_var_list.size(), 1u);
        EXPECT_EQ(rst.output_var_list[0].dtype(), dtype::Float32());
        for (const auto& t : rst.tensor_map) {
            t.second->copy_from(*xv).sync();
        }
        auto func =
                rst.graph->compile({make_callback_copy(rst.output_var_list[0], y2)});
        func->execute();
        func->wait();
    };
    // A fatal assertion inside a lambda only returns from that lambda; stop
    // the test here instead of continuing with missing state.
    ASSERT_NO_FATAL_FAILURE(dump());
    ASSERT_NO_FATAL_FAILURE(load());
    MGB_ASSERT_TENSOR_NEAR(y1, y2, 1e-3);
}
/*!
 * Same scenario as the plain Int4 serialization test, except that the graph
 * output is first passed through gopt::ParamFusePass; the optimized endpoint
 * is what gets executed and dumped.
 */
TEST(TestOprDNN, ConvBiasInt4SerializeWithParamFuse) {
    using namespace serialization;
    float inp_scale = 1.20210327f;
    float filt_scale = 1.20210406f;
    float bias_scale = inp_scale * filt_scale;
    DType output_dtype = dtype::QuantizedS4{inp_scale};
    HostTensorGenerator<dtype::Int8> gen;
    // Input value generated in dump() and copied into the loaded graph's
    // tensors in load().
    std::shared_ptr<HostTensorND> xv;
    // Graph input: host tensor copied to device, then converted to `dtype`.
    auto mkvar = [](const char* name, const DType& dtype,
                    std::shared_ptr<ComputingGraph> graph,
                    std::shared_ptr<HostTensorND> val) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, val).rename(name), dtype);
    };
    // Constant: shared device tensor converted to `dtype`.
    auto mkcvar = [&gen](const char* name, const TensorShape& shp, const DType& dtype,
                         std::shared_ptr<ComputingGraph> graph, const CompNode& cn) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name),
                dtype);
    };
    auto fname = output_file("ConvBiasInt4SerializeWithParamFuse");
    HostTensorND y1, y2;
    // Build the graph, apply ParamFusePass, run it (result in y1) and dump it.
    auto dump = [&]() {
        opr::ConvBias::Param param;
        param.mode = Mode::CONVOLUTION;
        auto cn = CompNode::load("cpu0");
        auto graph = ComputingGraph::make();
        xv = gen({1, 64, 56, 56}, cn);
        auto x = mkvar("x", dtype::QuantizedS4{inp_scale}, graph, xv);
        auto w =
                mkcvar("w", {256, 64, 1, 1}, dtype::QuantizedS4{filt_scale}, graph, cn);
        auto b =
                mkcvar("b", {1, 256, 1, 1}, dtype::QuantizedS32{bias_scale}, graph, cn);
        auto y = opr::ConvBiasForward::make(
                x, w, b, param, {}, OperatorNodeConfig{output_dtype});
        auto w1 = mkcvar(
                "w1", {64, 256, 1, 1}, dtype::QuantizedS4{filt_scale}, graph, cn);
        auto b1 =
                mkcvar("b1", {1, 64, 1, 1}, dtype::QuantizedS32{bias_scale}, graph, cn);
        y = opr::ConvBiasForward::make(
                y, w1, b1, param, {}, OperatorNodeConfig{output_dtype});
        y = opr::TypeCvt::make(y, dtype::Float32());
        SymbolVar y_param_fused;
        unpack_vector(
                gopt::GraphOptimizer{}
                        .add_pass<gopt::ParamFusePass>()
                        .apply({{y}})
                        .endpoint_vars(),
                y_param_fused);
        auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
        auto func = graph->compile({make_callback_copy(y_param_fused, y1)});
        func->execute();
        func->wait();
        auto rst = dumper->dump({y_param_fused});
        ASSERT_EQ(rst.outputs.size(), 1u);
    };
    // Load the dumped graph, feed it xv and run it (result in y2).
    auto load = [&]() {
        auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
        auto rst = loader->load();
        // Validate the output list before indexing into it.
        ASSERT_EQ(rst.output_var_list.size(), 1u);
        EXPECT_EQ(rst.output_var_list[0].dtype(), dtype::Float32());
        for (const auto& t : rst.tensor_map) {
            t.second->copy_from(*xv).sync();
        }
        auto func =
                rst.graph->compile({make_callback_copy(rst.output_var_list[0], y2)});
        func->execute();
        func->wait();
    };
    // A fatal assertion inside a lambda only returns from that lambda; stop
    // the test here instead of continuing with missing state.
    ASSERT_NO_FATAL_FAILURE(dump());
    ASSERT_NO_FATAL_FAILURE(load());
    MGB_ASSERT_TENSOR_NEAR(y1, y2, 1e-3);
}
}