#include "megbrain/test/autocheck.h"
#include "megbrain/test/megdnn_helper.h"
#include "megbrain/opr/basic_arith_wrapper.h"
#include "megbrain/opr/blas.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/opr/utility.h"
#include "megdnn/tensor_iter.h"
#include <algorithm>
using namespace mgb;
namespace {
using Mode = opr::Reduce::Mode;
using DataType = opr::Reduce::Param::DataType;
// Reference-implementation traits, selected by reduce mode and element type.
// The primary template is deliberately empty: a mode without a specialization
// fails to compile as soon as reduce_raw() or do_test_correctness() touches
// its members (init/reduce/finalize, GRAD_MAXERR/GRAD_EPS).
template <Mode mode, typename ctype>
struct ImplTrait {};
// SUM: start from zero and accumulate by addition.
template <typename ctype>
struct ImplTrait<Mode::SUM, ctype> {
    // numeric-gradient settings read by do_test_correctness()
    static constexpr float GRAD_MAXERR = 1e-4;
    static constexpr float GRAD_EPS = 1;

    // identity element of the reduction
    static ctype init() { return 0; }

    // fold one more value into the running result
    static ctype reduce(ctype lhs, ctype rhs) { return lhs + rhs; }

    // no post-processing is needed for a plain sum
    ctype finalize(ctype acc) { return acc; }
};
// SUM_SQR: start from zero and accumulate the square of every value.
template <typename ctype>
struct ImplTrait<Mode::SUM_SQR, ctype> {
    // numeric-gradient settings read by do_test_correctness()
    static constexpr float GRAD_MAXERR = 1e-3;
    static constexpr float GRAD_EPS = 0.01;

    // identity element of the reduction
    static ctype init() { return 0; }

    // only the incoming value is squared, never the accumulator
    static ctype reduce(ctype lhs, ctype rhs) { return lhs + rhs * rhs; }

    // no post-processing is needed
    ctype finalize(ctype acc) { return acc; }
};
// PRODUCT: start from one and accumulate by multiplication.
template <typename ctype>
struct ImplTrait<Mode::PRODUCT, ctype> {
    // numeric-gradient settings read by do_test_correctness()
    static constexpr float GRAD_MAXERR = 1e-4;
    static constexpr float GRAD_EPS = 0.01;

    // identity element of the reduction
    static ctype init() { return 1; }

    // fold one more factor into the running result
    static ctype reduce(ctype lhs, ctype rhs) { return lhs * rhs; }

    // no post-processing is needed
    ctype finalize(ctype acc) { return acc; }
};
// MAX: start from the lowest representable value and keep the larger operand.
template <typename ctype>
struct ImplTrait<Mode::MAX, ctype> {
    // numeric-gradient settings read by do_test_correctness()
    static constexpr float GRAD_MAXERR = 1e-2;
    static constexpr float GRAD_EPS = 1e-3;

    // lowest() rather than min(): for floating point min() is the smallest
    // positive value, not the most negative one
    static ctype init() { return std::numeric_limits<ctype>::lowest(); }

    // keep whichever operand is larger
    static ctype reduce(ctype lhs, ctype rhs) { return std::max(lhs, rhs); }

    // no post-processing is needed
    ctype finalize(ctype acc) { return acc; }
};
// MIN: start from the largest representable value and keep the smaller
// operand.
template <typename ctype>
struct ImplTrait<Mode::MIN, ctype> {
    // numeric-gradient settings read by do_test_correctness()
    static constexpr float GRAD_MAXERR = 1e-2;
    static constexpr float GRAD_EPS = 1e-3;

    // every real input compares less than or equal to this seed
    static ctype init() { return std::numeric_limits<ctype>::max(); }

    // keep whichever operand is smaller
    static ctype reduce(ctype lhs, ctype rhs) { return std::min(lhs, rhs); }

    // no post-processing is needed
    ctype finalize(ctype acc) { return acc; }
};
// MEAN: a sum that also counts its inputs so finalize() can divide by the
// count. Unlike the other modes this trait is stateful, which is why callers
// go through an instance for init/reduce/finalize.
template <typename ctype>
struct ImplTrait<Mode::MEAN, ctype> {
    // numeric-gradient settings read by do_test_correctness()
    static constexpr float GRAD_MAXERR = 1e-4;
    static constexpr float GRAD_EPS = 1e-2;

    // number of values folded in since the last init()
    size_t nr_elems;

    // reset the element counter and return the additive identity
    ctype init() {
        nr_elems = 0;
        return 0;
    }

    // count the value and add it to the running sum
    ctype reduce(ctype lhs, ctype rhs) {
        ++nr_elems;
        return lhs + rhs;
    }

    // divide the sum by the number of reduced values, in ctype arithmetic
    ctype finalize(ctype acc) {
        const ctype count = static_cast<ctype>(nr_elems);
        return acc / count;
    }
};
/*!
 * Naive host-side reduction used as the reference result in these tests.
 *
 * \param dest output tensor; it must already carry the target shape. A scalar
 *      target shape means "reduce everything"; otherwise dest must have the
 *      same ndim as src and every axis whose extent differs from src must
 *      have extent 1 (those axes are reduced).
 * \param src input tensor; read through its layout, so it does not have to be
 *      contiguous.
 */
template <Mode mode, typename ctype>
void reduce_raw(HostTensorND& dest, const HostTensorND& src) {
    using Impl = ImplTrait<mode, ctype>;
    auto out_shape = dest.shape();

    // fold every element reachable through t into a single value
    auto fold_all = [](const HostTensorND& t) -> ctype {
        Impl impl;
        ctype acc = impl.init();
        for (ctype v : megdnn::tensor_iter_valonly<ctype>(t.as_megdnn())) {
            acc = impl.reduce(acc, v);
        }
        return impl.finalize(acc);
    };

    if (out_shape.is_scalar()) {
        if (src.shape().is_scalar()) {
            // nothing to reduce: plain copy
            dest.copy_from_fixlayout(src);
        } else {
            dest.ptr<ctype>()[0] = fold_all(src);
        }
        return;
    }

    mgb_assert(out_shape.ndim == src.shape().ndim);

    // collect the axes that are collapsed to extent 1 in dest
    std::vector<size_t> reduced_axes;
    for (size_t ax = 0; ax < out_shape.ndim; ++ax) {
        if (out_shape.shape[ax] == src.shape(ax)) {
            continue;
        }
        mgb_assert(out_shape.shape[ax] == 1);
        reduced_axes.push_back(ax);
    }
    if (reduced_axes.empty()) {
        // shapes are identical, so the reduction degenerates to a copy
        dest.copy_from_fixlayout(src);
        return;
    }

    // layout of the slice of src that feeds one output element: only the
    // reduced axes, with the shape and stride they have in src
    TensorLayout sub_layout{dest.dtype()};
    sub_layout.ndim = reduced_axes.size();
    size_t sub_dim = 0;
    for (size_t ax : reduced_axes) {
        sub_layout.shape[sub_dim] = src.layout().shape[ax];
        sub_layout.stride[sub_dim] = src.layout().stride[ax];
        ++sub_dim;
    }

    auto dest_range = megdnn::tensor_iter<ctype>(dest.as_megdnn());
    for (auto out_it = dest_range.begin(), out_end = dest_range.end();
         out_it != out_end; ++out_it) {
        // element offset in src of the slice origin; the dest index is 0 on
        // every reduced axis because those axes have extent 1 in dest
        ptrdiff_t offset = 0;
        for (size_t ax = 0; ax < out_shape.ndim; ++ax) {
            offset += out_it.idx()[ax] * src.layout().stride[ax];
        }
        auto subspec = SubTensorSpec::make_from_offset_elem(sub_layout, offset);
        // sub() is called through a non-const reference; the resulting view
        // is only read here
        HostTensorND subt = const_cast<HostTensorND&>(src).sub(subspec);
        *out_it = fold_all(subt);
    }
}
template <Mode mode, class dtype>
void do_test_correctness() {
using ctype = typename DTypeTrait<dtype>::ctype;
using Impl = ImplTrait<mode, ctype>;
using Checker = AutoOprChecker<1, 1, dtype>;
constexpr int AXIS = 1;
auto make_graph = [&](const typename Checker::SymInpArray& inputs) ->
typename Checker::SymOutArray {
return {opr::Reduce::make(inputs[0], {mode, AXIS})};
};
auto fwd = [&](typename Checker::NumOutArray& dest,
typename Checker::NumInpArray inp) {
TensorShape oshp = inp[0]->shape();
oshp.shape[1] = 1;
dest[0].resize(oshp);
reduce_raw<mode, ctype>(dest[0], *inp[0]);
};
typename Checker::RunOptions opt;
opt.numdiff_eps = Impl::GRAD_EPS;
opt.numdiff_max_err = Impl::GRAD_MAXERR;
using S = TensorShape;
Checker{make_graph, fwd}
.run({S{2, 3, 4}}, opt)
.run({S{2, 2, 3, 4}}, opt)
.run({S{2, 3, 4, 3}}, opt);
}
// Entry point used by the OPR_TEST-generated tests: checks one reduce mode
// for both Float32 and Int32 inputs.
template <Mode mode>
void test_correctness() {
// fixed seed so that every run sees the same random inputs
set_rand_seed(19931102);
do_test_correctness<mode, dtype::Float32>();
do_test_correctness<mode, dtype::Int32>();
}
// Shared body of the BaseImplXY tests. Builds y = reduce_sum(x, tshp) where
// the input and/or the target shape can be wrapped in MarkDynamicVar, then for
// many (input shape, reduced axes) combinations compares both the graph
// output and opr::Reduce::perform() against reduce_raw().
//
// dyn_inp:  wrap the input x in MarkDynamicVar
// dyn_tshp: wrap the target-shape var in MarkDynamicVar
void test_base_impl(bool dyn_inp, bool dyn_tshp) {
HostTensorGenerator<> gen;
auto host_x = gen({10});
auto host_tshp =
std::make_shared<HostTensorND>(host_x->comp_node(), dtype::Int32());
host_tshp->resize({1}).ptr<int>()[0] = 1;
HostTensorND host_y, expected{host_x->comp_node(), dtype::Float32()};
// tensors and megdnn operator for the graph-less opr::Reduce::perform() path
DeviceTensorND static_calc_x{CompNode::default_cpu()},
static_calc_workspace{CompNode::default_cpu()},
static_calc_y{CompNode::default_cpu()};
auto static_calc_opr =
opr::intl::create_megdnn_opr<megdnn::Reduce>(CompNode::default_cpu());
auto graph = ComputingGraph::make();
auto x = opr::Host2DeviceCopy::make(*graph, host_x, {"x"}),
tshp = opr::Host2DeviceCopy::make(*graph, host_tshp, {"tshp"});
if (dyn_inp)
x = opr::MarkDynamicVar::make(x);
if (dyn_tshp)
tshp = opr::MarkDynamicVar::make(tshp);
auto y = opr::reduce_sum(x, tshp);
auto func = graph->compile({make_callback_copy(y, host_y)});
// a static target shape must give a statically known output shape ...
if (!dyn_tshp) {
ASSERT_TRUE(cg::is_static_var_shape(y.node()));
}
// ... and the output value is static only when both inputs are static
if (!dyn_inp && !dyn_tshp) {
ASSERT_TRUE(cg::is_static_var_value(y.node()));
}
bool check_succ = false;
// Runs one case. reduce_axes lists the axes collapsed to extent 1; the
// single-entry list {size_t(-1)} is the sentinel for "reduce to shape {1}".
auto do_check = [&](const TensorShape& ishp,
const std::vector<size_t>& reduce_axes) {
check_succ = false;
host_x->copy_from(*gen(ishp));
auto oshp = ishp;
if (reduce_axes.size() == 1 && reduce_axes[0] == (size_t)-1) {
oshp.shape[0] = 1;
oshp.ndim = 1;
} else {
for (auto i : reduce_axes)
oshp.shape[i] = 1;
}
{
// feed the target shape to the graph through the tshp host tensor
DeviceTensorND tmp;
cg::copy_shape_to_tensor_value(tmp, oshp);
host_tshp->copy_from(tmp);
}
func->execute();
// with nothing to reduce the output is expected to share the input's device
// pointer, except for the static-input / dynamic-tshp combination
if (reduce_axes.empty() && !(!dyn_inp && dyn_tshp)) {
ASSERT_EQ(x.node()->prev_dev_ptr(), y.node()->prev_dev_ptr());
}
expected.resize(oshp);
reduce_raw<Mode::SUM, float>(expected, *host_x);
MGB_ASSERT_TENSOR_NEAR(expected, host_y, 1e-5);
// second path: the same reduction through opr::Reduce::perform()
static_calc_x.copy_from(*host_x);
opr::Reduce::perform(
Mode::SUM, static_calc_y, static_calc_workspace, static_calc_x,
dtype::Float32(), oshp, static_calc_opr);
// presumably perturbs host_y so the next comparison can only pass if
// copy_from() really overwrote it with the perform() result
host_y.ptr<float>()[0]++;
host_y.copy_from(static_calc_y);
MGB_ASSERT_TENSOR_NEAR(expected, host_y, 1e-5);
check_succ = true;
};
// The ASSERT_* macros inside do_check only return from that lambda, so
// check_succ is what turns an early exit into a hard failure here.
auto check = [&](const TensorShape& ishp, const std::vector<size_t>& reduce_axes) {
do_check(ishp, reduce_axes);
mgb_assert(check_succ);
};
check({1, 2}, {size_t(-1)});
check({1, 2}, {});
check({1}, {});
check({2}, {0});
check({2, 3}, {0, 1});
check({2, 3, 4}, {0, 1, 2});
check({2, 3, 4, 5}, {0, 1, 2, 3});
check({2, 3, 4, 5, 6}, {0, 1, 2, 3, 4});
check({2, 3, 4, 5, 6}, {size_t(-1)});
check({1, 1, 1}, {size_t(-1)});
check({1, 2, 3, 4}, {});
// every single axis of a 4-dim input
for (size_t i = 0; i < 4; i++)
check({3, 2, 5, 6}, {i});
// every pair of axes of a 4-dim input
for (size_t i = 0; i < 4; i++)
for (size_t j = i + 1; j < 4; j++)
check({4, 2, 6, 7}, {i, j});
// every triple of axes of a 5-dim input
for (size_t i = 0; i < 5; i++)
for (size_t j = i + 1; j < 5; j++)
for (size_t k = j + 1; k < 5; k++)
check({4, 5, 2, 7, 2}, {i, j, k});
check({100, 100, 32}, {1});
}
}
// Static input, static target shape.
TEST(TestBasicArithReduction, BaseImpl00) {
    constexpr bool dyn_inp = false;
    constexpr bool dyn_tshp = false;
    test_base_impl(dyn_inp, dyn_tshp);
}
// Static input, dynamic target shape.
TEST(TestBasicArithReduction, BaseImpl01) {
    constexpr bool dyn_inp = false;
    constexpr bool dyn_tshp = true;
    test_base_impl(dyn_inp, dyn_tshp);
}
// Dynamic input, static target shape.
TEST(TestBasicArithReduction, BaseImpl10) {
    constexpr bool dyn_inp = true;
    constexpr bool dyn_tshp = false;
    test_base_impl(dyn_inp, dyn_tshp);
}
// Dynamic input, dynamic target shape.
TEST(TestBasicArithReduction, BaseImpl11) {
    constexpr bool dyn_inp = true;
    constexpr bool dyn_tshp = true;
    test_base_impl(dyn_inp, dyn_tshp);
}
// Reduce configured only by an axis (no target-shape var): SUM over axis 1,
// once with a static and once with a dynamic input.
TEST(TestBasicArithReduction, AxisOnly) {
    HostTensorGenerator<> gen;
    auto host_x = gen({2, 6, 7, 8});

    for (int pass = 0; pass < 2; ++pass) {
        const bool dyn = (pass == 1);

        auto graph = ComputingGraph::make();
        auto inp = opr::Host2DeviceCopy::make(*graph, host_x);
        if (dyn) {
            inp = opr::MarkDynamicVar::make(inp);
        }
        auto reduced = opr::Reduce::make(inp, {Mode::SUM, 1});

        HostTensorND host_y;
        HostTensorND expected{host_x->comp_node(), host_x->dtype()};
        auto func = graph->compile({make_callback_copy(reduced, host_y)});
        func->execute();

        // reference: same shape as the input with axis 1 collapsed
        expected.resize({2, 1, 7, 8});
        reduce_raw<Mode::SUM, float>(expected, *host_x);
        MGB_ASSERT_TENSOR_EQ(expected, host_y);
    }
}
// Every axis of a 4-dim input addressed through its negative index (-4..-1),
// for both static and dynamic inputs.
TEST(TestBasicArithReduction, NegativeAxis) {
    HostTensorGenerator<> gen;
    auto host_x = gen({2, 6, 7, 8});

    for (bool dyn : {false, true}) {
        for (int axis = 0; axis < 4; ++axis) {
            auto graph = ComputingGraph::make();
            auto inp = opr::Host2DeviceCopy::make(*graph, host_x);
            if (dyn) {
                inp = opr::MarkDynamicVar::make(inp);
            }
            // axis - 4 is the negative spelling of the same axis
            auto reduced = opr::Reduce::make(inp, {Mode::SUM, axis - 4});

            HostTensorND host_y;
            HostTensorND expected{host_x->comp_node(), host_x->dtype()};
            auto func = graph->compile({make_callback_copy(reduced, host_y)});
            func->execute();

            // the reference is computed with the non-negative axis
            megdnn::TensorShape ref_shape({2, 6, 7, 8});
            ref_shape.shape[axis] = 1;
            expected.resize(ref_shape);
            reduce_raw<Mode::SUM, float>(expected, *host_x);
            MGB_ASSERT_TENSOR_EQ(expected, host_y);
        }
    }
}
// reduce_sum to a scalar over a broadcast (hence non-contiguous) input.
TEST(TestBasicArithReduction, NonCont) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();

    for (int dyn = 0; dyn < 4; ++dyn) {
        auto host_x = gen({2, 1});
        auto x = opr::Host2DeviceCopy::make(*graph, host_x);
        auto xnt = x.broadcast({2, 4});
        auto tshp = x.make_scalar(1);

        // NOTE(review): `dyn & 3` is non-zero for every dyn in 1..3, so the
        // input is dynamic whenever dyn != 0 and the "static input, dynamic
        // tshp" combination is never exercised; `dyn & 2` may have been
        // intended -- confirm before changing.
        if (dyn & 3) {
            xnt = opr::MarkDynamicVar::make(xnt);
        }
        if (dyn & 1) {
            tshp = opr::MarkDynamicVar::make(tshp);
        }

        auto y = opr::reduce_sum(xnt, tshp);
        HostTensorND host_y;
        auto func = graph->compile({make_callback_copy(y, host_y)});
        func->execute();

        ASSERT_TRUE(host_y.shape().is_scalar());
        // each of the two source values appears four times after broadcast
        auto src = host_x->ptr<float>();
        MGB_ASSERT_FLOAT_EQ((src[0] + src[1]) * 4, host_y.ptr<float>()[0]);
    }
}
// reduce_sum whose target shape equals the shape of the (broadcast,
// non-contiguous) input: the values must pass through unchanged, and in some
// configurations the device memory is expected to be shared as well.
TEST(TestBasicArithReduction, NonContFwd) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();

    for (int dyn = 0; dyn < 4; ++dyn) {
        auto host_x = gen({2, 1});
        auto x = opr::Host2DeviceCopy::make(*graph, host_x);
        auto xnt = x.broadcast({2, 4});
        auto tshp = xnt.symshape();

        // NOTE(review): `dyn & 3` is non-zero for every dyn in 1..3, so the
        // input is dynamic whenever dyn != 0 and the "static input, dynamic
        // tshp" combination is never exercised; `dyn & 2` may have been
        // intended -- confirm before changing.
        if (dyn & 3) {
            xnt = opr::MarkDynamicVar::make(xnt);
        }
        if (dyn & 1) {
            tshp = opr::MarkDynamicVar::make(tshp);
        }

        auto y = opr::reduce_sum(xnt, tshp);
        HostTensorND host_y;
        auto func = graph->compile({make_callback_copy(y, host_y)});
        func->execute();

        ASSERT_EQ(TensorShape({2, 4}), host_y.shape());
        // row r of the output repeats host_x[r] in every column
        for (size_t row = 0; row < 2; ++row) {
            for (size_t col = 0; col < 4; ++col) {
                MGB_ASSERT_FLOAT_EQ(
                        host_x->ptr<float>()[row],
                        host_y.ptr<float>({row, col})[0]);
            }
        }

        if (dyn == 0) {
            // fully static: broadcast and reduce both share x's device ptr
            ASSERT_EQ(dev_ptr(x), dev_ptr(xnt));
            ASSERT_EQ(dev_ptr(x), dev_ptr(y));
        }
        if (dyn == 3) {
            // fully dynamic: the reduce output shares its input's device ptr
            ASSERT_EQ(xnt.node()->prev_dev_ptr(), y.node()->prev_dev_ptr());
        }
    }
}
// opr::Reduce::perform() on a single value broadcast to {5, 5}: every output
// element must equal (per-element contribution) * 25 / (number of outputs).
TEST(TestBasicArithReduction, NonContPerform) {
    DeviceTensorND x{CompNode::default_cpu(), dtype::Float32()};
    DeviceTensorND y{x.comp_node(), x.dtype()};
    DeviceTensorND workspace;

    x.resize({1}).ptr<float>()[0] = 2.3;
    // re-interpret the one-element storage as a broadcast 5x5 tensor
    x.reset(x.storage(), x.layout().broadcast({5, 5}));
    auto opr = opr::intl::create_megdnn_opr<megdnn::Reduce>(x.comp_node());

    // contribution of one input element: 2.3 for SUM, 2.3 * 2.3 for SUM_SQR
    float elem_val = 2.3;
    for (auto mode : {Mode::SUM, Mode::SUM_SQR}) {
        for (auto&& tshp : TensorShapeArray{{5, 1}, {1, 5}, {1, 1}, {1}, {5, 5}}) {
            opr::Reduce::perform(mode, y, workspace, x, dtype::Float32(), tshp, opr);
            ASSERT_TRUE(y.layout().is_contiguous());
            ASSERT_EQ(tshp, y.shape());

            const size_t nr = tshp.total_nr_elems();
            const float expect = elem_val * 25 / nr;
            auto out = y.ptr<float>();
            for (size_t idx = 0; idx < nr; ++idx) {
                MGB_ASSERT_FLOAT_EQ(expect, out[idx]);
            }
        }
        elem_val *= 2.3;
    }
}
// Two reduce_sum_sqr oprs on the same input: one with the input's own shape
// as target (element-wise square) and one that keeps only axis 1 (or reduces
// to a single element when `scalar` is set).
TEST(TestBasicArithReduction, SideEffect) {
    using Checker = AutoOprChecker<1, 2>;
    using S = TensorShape;

    auto make_graph = [&](const Checker::SymInpArray& inputs,
                          bool scalar) -> Checker::SymOutArray {
        auto x = inputs[0];
        auto full_shp = opr::GetVarShape::make(x);
        opr::Subtensor::IndexDesc desc{
                opr::Subtensor::AxisIndexer::make_index(0, x.make_scalar(1))};
        // all-ones shape vector with entry 1 copied back from the real shape
        auto ones = full_shp.fill_retain_dtype(1);
        auto axis1_extent = opr::Subtensor::make(full_shp, desc);
        auto partial_shp = opr::SetSubtensor::make(ones, axis1_extent, desc);
        if (scalar) {
            partial_shp = partial_shp.make_scalar(1);
        }
        return {opr::reduce_sum_sqr(x, full_shp),
                opr::reduce_sum_sqr(x, partial_shp)};
    };

    auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp,
                   bool scalar) {
        auto& x = *inp[0];
        auto& y0 = dest[0];
        auto& y1 = dest[1];

        // y0: element-wise square of the input
        y0.copy_from(x);
        auto sq = y0.ptr<float>();
        const size_t total = x.shape().total_nr_elems();
        for (size_t idx = 0; idx < total; ++idx) {
            sq[idx] *= sq[idx];
        }

        // y1: sum of the squares with every axis except 1 collapsed
        auto y1_shp = y0.shape();
        for (size_t ax = 0; ax < y1_shp.ndim; ++ax) {
            if (ax == 1) {
                continue;
            }
            y1_shp[ax] = 1;
        }
        if (scalar) {
            y1_shp.ndim = 1;
            y1_shp[0] = 1;
        }
        reduce_raw<opr::Reduce::Mode::SUM, dt_float32>(y1.resize(y1_shp), y0);
    };

    for (bool scalar : {false, true}) {
        using namespace std::placeholders;
        Checker checker{
                std::bind(make_graph, _1, scalar),
                std::bind(fwd, _1, _2, scalar)};
        checker.run({S{2, 3, 4}});
        checker.run({S{2, 2, 3, 4}});
        checker.run({S{3, 3, 2, 3}});
        checker.run({S{1, 1}});
    }
}
// A 3-dim target shape {64, 22, 22} applied to a 4-dim input must give the
// same values as the 4-dim target {1, 64, 22, 22} reshaped to 3 dims, for
// every reduce mode.
TEST(TestBasicArithReduction, DifferentNDim) {
    HostTensorGenerator<> gen;
    for (size_t first_dim = 1; first_dim <= 2; ++first_dim) {
        auto host_x = gen({first_dim, 64, 22, 22});

        // target shape with fewer dims than the input
        auto host_tshp =
                std::make_shared<HostTensorND>(host_x->comp_node(), dtype::Int32());
        {
            int* dims = host_tshp->resize({3}).ptr<int>();
            dims[0] = 64;
            dims[1] = 22;
            dims[2] = 22;
        }

        // same target shape spelled with the input's number of dims
        auto host_tshp_equal =
                std::make_shared<HostTensorND>(host_x->comp_node(), dtype::Int32());
        {
            int* dims = host_tshp_equal->resize({4}).ptr<int>();
            dims[0] = 1;
            dims[1] = 64;
            dims[2] = 22;
            dims[3] = 22;
        }

        using namespace opr;
        auto graph = ComputingGraph::make();
        graph->options().graph_opt_level = 0;
        auto x = opr::relu(opr::Host2DeviceCopy::make(*graph, host_x, {"x"}));
        auto tshp = opr::Host2DeviceCopy::make(*graph, host_tshp, {"tshp"});
        auto tshp_equal =
                opr::Host2DeviceCopy::make(*graph, host_tshp_equal, {"tshp_equal"});

        auto check_mode = [&](Reduce::Mode mode) {
            Reduce::Param param_default{
                    mode, MEGDNN_MAX_NDIM, Reduce::Param::DataType::DEFAULT};
            auto reduce_default = opr::Reduce::make(x, param_default, tshp);
            auto reduce_equal_4d = opr::Reduce::make(x, param_default, tshp_equal);
            auto reduce_equal = opr::Reshape::make(reduce_equal_4d, tshp);

            HostTensorND host_default;
            HostTensorND host_equal;
            auto func = graph->compile(
                    {make_callback_copy(reduce_default, host_default),
                     make_callback_copy(reduce_equal, host_equal)});
            func->execute();
            MGB_ASSERT_TENSOR_EQ(host_default, host_equal);
        };

        for (auto mode :
             {Reduce::Mode::PRODUCT, Reduce::Mode::MAX, Reduce::Mode::MIN,
              Reduce::Mode::SUM, Reduce::Mode::SUM_SQR, Reduce::Mode::MEAN}) {
            check_mode(mode);
        }
    }
}
// Compares the DEFAULT data type on float32 input against the FLOAT_O32xC32
// and FLOAT_O16xC32 data types on the same values held as float16.
TEST(TestBasicArithReduction, MultiType) {
    HostTensorGenerator<> gen;
    auto host_x = gen({1, 64, 22, 22});
    auto host_tshp =
            std::make_shared<HostTensorND>(host_x->comp_node(), dtype::Int32());
    host_tshp->resize({4});

    // overwrite the four target-shape entries in place
    auto set_tshp = [&](int d0, int d1, int d2, int d3) {
        int* dims = host_tshp->ptr<int>();
        dims[0] = d0;
        dims[1] = d1;
        dims[2] = d2;
        dims[3] = d3;
    };
    set_tshp(1, 64, 1, 1);

    using namespace opr;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto x_fp16 = opr::relu(opr::TypeCvt::make(
            opr::Host2DeviceCopy::make(*graph, host_x, {"x"}), dtype::Float16()));
    auto tshp = opr::Host2DeviceCopy::make(*graph, host_tshp, {"tshp"});
    // float32 view of the values already rounded to float16, so all variants
    // reduce the same numbers
    auto x = opr::TypeCvt::make(x_fp16, dtype::Float32());

    auto check_mode = [&](Reduce::Mode mode) {
        Reduce::Param param_default{
                mode, MEGDNN_MAX_NDIM, Reduce::Param::DataType::DEFAULT};
        Reduce::Param param_i16_co32{
                mode, MEGDNN_MAX_NDIM, Reduce::Param::DataType::FLOAT_O32xC32};
        Reduce::Param param_io16_c32{
                mode, MEGDNN_MAX_NDIM, Reduce::Param::DataType::FLOAT_O16xC32};

        auto reduce_default = opr::Reduce::make(x, param_default, tshp);
        auto reduce_i16_co32 = opr::Reduce::make(x_fp16, param_i16_co32, tshp);
        auto reduce_io16_c32 = opr::Reduce::make(x_fp16, param_io16_c32, tshp);
        auto reduce_default_as16 =
                opr::TypeCvt::make(reduce_default, dtype::Float16());

        HostTensorND host_default;
        HostTensorND host_default_as16;
        HostTensorND host_i16_co32;
        HostTensorND host_io16_c32;
        auto func = graph->compile(
                {make_callback_copy(reduce_default, host_default),
                 make_callback_copy(reduce_i16_co32, host_i16_co32),
                 make_callback_copy(reduce_io16_c32, host_io16_c32),
                 make_callback_copy(reduce_default_as16, host_default_as16)});
        func->execute();
        MGB_ASSERT_TENSOR_EQ(host_default, host_i16_co32);
        MGB_ASSERT_TENSOR_EQ(host_default_as16, host_io16_c32);
    };

    // real reduction over the two trailing axes: only SUM_SQR is checked
    for (auto mode : {Reduce::Mode::SUM_SQR}) {
        check_mode(mode);
    }

    // target shape equal to the input shape: all modes are checked
    set_tshp(1, 64, 22, 22);
    for (auto mode :
         {Reduce::Mode::PRODUCT, Reduce::Mode::MAX, Reduce::Mode::MIN,
          Reduce::Mode::SUM, Reduce::Mode::SUM_SQR, Reduce::Mode::MEAN}) {
        check_mode(mode);
    }
}
// SUM over a large float16 input concatenated with its negation. The C32
// variants must agree with the float32 reference and stay finite, while the
// DEFAULT data type on float16 input ("bad") is expected to end up
// non-finite.
TEST(TestBasicArithReduction, C32VsC16) {
    HostTensorGenerator<> gen(1.f, 2.f);
    auto host_x = gen({1, 32, 100000, 2});
    auto host_tshp =
            std::make_shared<HostTensorND>(host_x->comp_node(), dtype::Int32());
    {
        int* dims = host_tshp->resize({4}).ptr<int>();
        dims[0] = 1;
        dims[1] = 32;
        dims[2] = 1;
        dims[3] = 1;
    }

    using namespace opr;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto x_fp16 = opr::relu(opr::TypeCvt::make(
            opr::Host2DeviceCopy::make(*graph, host_x, {"x"}), dtype::Float16()));
    auto tshp = opr::Host2DeviceCopy::make(*graph, host_tshp, {"tshp"});
    // append the negated values along axis 0
    x_fp16 = opr::Concat::make({x_fp16, -x_fp16}, 0);
    auto x = opr::TypeCvt::make(x_fp16, dtype::Float32());

    const Reduce::Param::Mode mode = Reduce::Param::Mode::SUM;
    Reduce::Param param_default{
            mode, MEGDNN_MAX_NDIM, Reduce::Param::DataType::DEFAULT};
    Reduce::Param param_i16_co32{
            mode, MEGDNN_MAX_NDIM, Reduce::Param::DataType::FLOAT_O32xC32};
    Reduce::Param param_io16_c32{
            mode, MEGDNN_MAX_NDIM, Reduce::Param::DataType::FLOAT_O16xC32};

    auto reduce_default = opr::Reduce::make(x, param_default, tshp);
    auto reduce_i16_co32 = opr::Reduce::make(x_fp16, param_i16_co32, tshp);
    auto reduce_io16_c32 = opr::Reduce::make(x_fp16, param_io16_c32, tshp);
    auto reduce_default_as16 = opr::TypeCvt::make(reduce_default, dtype::Float16());
    // float16 input with the DEFAULT data type
    auto bad = opr::Reduce::make(x_fp16, param_default, tshp);

    HostTensorND host_default;
    HostTensorND host_default_as16;
    HostTensorND host_i16_co32;
    HostTensorND host_io16_c32;
    HostTensorND host_bad;
    auto func = graph->compile(
            {make_callback_copy(reduce_default, host_default),
             make_callback_copy(reduce_i16_co32, host_i16_co32),
             make_callback_copy(reduce_io16_c32, host_io16_c32),
             make_callback_copy(reduce_default_as16, host_default_as16),
             make_callback_copy(bad, host_bad)});
    func->execute();

    MGB_ASSERT_TENSOR_EQ(host_default, host_i16_co32);
    MGB_ASSERT_TENSOR_EQ(host_default_as16, host_io16_c32);

    const size_t nr_out = host_io16_c32.shape().total_nr_elems();
    for (size_t idx = 0; idx < nr_out; ++idx) {
        float good_val = host_io16_c32.ptr<dt_float16>()[idx];
        float bad_val = host_bad.ptr<dt_float16>()[idx];
        ASSERT_TRUE(std::isfinite(good_val));
        ASSERT_FALSE(std::isfinite(bad_val));
    }
}
// AutoOprChecker run of opr::Reduce with the FLOAT_O32xC32 data type on
// inputs converted to float16 and int32. The second checker input is only
// used for its shape, which becomes the reduce target shape.
TEST(TestBasicArithReduction, AutoCheck) {
    using Checker = AutoOprChecker<2, 1>;
    using Param = opr::Reduce::Param;
    Param param;
    // param must be captured by reference: check() below rewrites its mode
    // and data_type before each Checker is run.
    auto make_graph = [&param](
                              const Checker::SymInpArray& inputs,
                              DType dtype) -> Checker::SymOutArray {
        auto inp = inputs[0];
        auto tshp = inputs[1].symshape();
        inp = opr::TypeCvt::make(inp, dtype);
        return {opr::Reduce::make(inp, param, tshp)};
    };
    auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp, DType dtype) {
        auto cn = inp[0]->storage().comp_node();
        TensorShape out_shape = inp[1]->shape();
        dest[0] = HostTensorND{cn, out_shape, dtype::Float32()};
        // Round-trip the float32 input through `dtype` so the reference sees
        // the same values as the graph does after its TypeCvt.
        HostTensorND tmp_inp{cn, inp[0]->shape(), dtype};
        HostTensorND new_inp{cn, inp[0]->shape(), dtype::Float32()};
        auto typecvt = megdnn_naive_handle()->create_operator<megdnn::TypeCvt>();
        typecvt->exec(inp[0]->as_megdnn(), tmp_inp.as_megdnn());
        typecvt->exec(tmp_inp.as_megdnn(), new_inp.as_megdnn());
#define dispatch_by_mode(CTYPE, MODE, in, out) \
    if (MODE == param.mode) {                  \
        reduce_raw<MODE, CTYPE>(out, in);      \
    }
#define dispatch_by_dtype(DTYPE, in, out)             \
    mgb_assert(DTYPE() == (in).dtype());              \
    typedef DTypeTrait<DTYPE>::ctype ctype;           \
    dispatch_by_mode(ctype, Mode::MIN, in, out);      \
    dispatch_by_mode(ctype, Mode::MAX, in, out);      \
    dispatch_by_mode(ctype, Mode::SUM, in, out);      \
    dispatch_by_mode(ctype, Mode::PRODUCT, in, out);  \
    dispatch_by_mode(ctype, Mode::SUM_SQR, in, out);  \
    dispatch_by_mode(ctype, Mode::MEAN, in, out);
        // the reference path only models float32 computation and output
        mgb_assert(param.data_type == Param::DataType::FLOAT_O32xC32);
        dispatch_by_dtype(dtype::Float32, new_inp, dest[0]);
#undef dispatch_by_mode
#undef dispatch_by_dtype
    };
    auto check = [&](Mode mode, Param::DataType data_type, DType dtype) {
        param.mode = mode;
        param.data_type = data_type;
        Checker::RunOptions opts;
        opts.outputs_max_err = 1e-3;
        opts.numdiff_max_err = 5e-1;
        using namespace std::placeholders;
        Checker checker(
                std::bind(make_graph, _1, dtype), std::bind(fwd, _1, _2, dtype));
        if (dtype.category() == DTypeCategory::FLOAT) {
            // input 1 only supplies the target shape, so it has no gradient
            checker.set_input_allow_grad(1, false);
        } else {
            checker.disable_grad_check();
        }
        checker.run({TensorShape{22, 21}, {22, 1}}, opts)
                .run({TensorShape{22, 21}, {1, 1}}, opts)
                .run({TensorShape{22, 21}, {22, 1}}, opts);
    };
    for (auto mode : {Mode::SUM, Mode::MAX, Mode::MIN, Mode::PRODUCT, Mode::MEAN}) {
        check(mode, Param::DataType::FLOAT_O32xC32, dtype::Float16());
        check(mode, Param::DataType::FLOAT_O32xC32, dtype::Int32());
    }
}
// Generates one gtest case per reduce mode; each case runs
// test_correctness<Mode::o>() and is named after the mode.
#define OPR_TEST(o) \
TEST(TestBasicArithReduction, o) { test_correctness<Mode::o>(); }
OPR_TEST(SUM)
OPR_TEST(SUM_SQR)
OPR_TEST(PRODUCT)
OPR_TEST(MAX)
OPR_TEST(MIN)
OPR_TEST(MEAN)
// Reduce must keep working with comp_node_seq_record_level = 2: the graph is
// destroyed right after compilation and the compiled function is then
// executed twice.
TEST(TestBasicArithReduction, CompSeqRecordLevel2) {
    HostTensorGenerator<> gen;
    auto host_x = gen({1}, CompNode::load("cpux"));
    auto host_tshp =
            std::make_shared<HostTensorND>(host_x->comp_node(), dtype::Int32());
    host_tshp->resize({1}).ptr<int>()[0] = 1;

    using namespace opr;
    auto graph = ComputingGraph::make();
    auto& options = graph->options();
    options.var_sanity_check_first_run = false;
    options.comp_node_seq_record_level = 2;
    options.graph_opt_level = 0;

    auto x_fp16 = opr::relu(opr::TypeCvt::make(
            opr::Host2DeviceCopy::make(*graph, host_x, {"x"}), dtype::Float16()));
    auto tshp = opr::Host2DeviceCopy::make(*graph, host_tshp, {"tshp"});
    auto mode = Reduce::Mode::SUM_SQR;
    auto x = opr::TypeCvt::make(x_fp16, dtype::Float32());

    Reduce::Param param_default{
            mode, MEGDNN_MAX_NDIM, Reduce::Param::DataType::DEFAULT};
    Reduce::Param param_i16_co32{
            mode, MEGDNN_MAX_NDIM, Reduce::Param::DataType::FLOAT_O32xC32};
    auto reduce_default = opr::Reduce::make(x, param_default, tshp);
    auto reduce_i16_co32 = opr::Reduce::make(x_fp16, param_i16_co32, tshp);

    HostTensorND host_default;
    HostTensorND host_i16_co32;
    auto func = graph->compile({
            make_callback_copy(reduce_default, host_default, false),
            make_callback_copy(reduce_i16_co32, host_i16_co32, false),
    });

    // from here on only the compiled function is used
    ComputingGraph::assert_destroy(graph);
    for (int run = 0; run < 2; ++run) {
        EXPECT_NO_THROW(func->execute().wait());
    }
}
// The value of a reduce_sum whose target shape is derived from the input's
// own shape must be obtainable through static inference and must match
// reduce_raw().
TEST(TestBasicArithReduction, StaticInferValue) {
    using AI = opr::Subtensor::AxisIndexer;

    HostTensorGenerator<> gen;
    auto host_x = gen({2, 3, 4, 5});
    auto graph = ComputingGraph::make();

    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    auto x_shape = opr::GetVarShape::make(x);
    // interval on axis 0 of the shape vector starting at -2 with no end and
    // no step given; presumably this selects the last two extents {4, 5}
    auto x_shape_sub = opr::Subtensor::make(
            x_shape, {AI::make_interval(0, x.make_scalar(-2), nullptr, nullptr)});
    auto y = opr::reduce_sum(x, x_shape_sub);

    auto inferred_dev = graph->static_infer_manager().infer_value(y.node());

    // reference computed with a 4-dim target, then viewed with the layout of
    // the inferred value so the two tensors are comparable
    HostTensorND expected{host_x->comp_node(), dtype::Float32()};
    expected.resize({1, 1, 4, 5});
    reduce_raw<Mode::SUM, float>(expected, *host_x);
    expected.reset(expected.storage(), inferred_dev.layout());

    HostTensorND inferred = HostTensorND::make_proxy(inferred_dev);
    MGB_ASSERT_TENSOR_EQ(inferred, expected);
}
// The dtype of the statically inferred reduce value must follow the
// Param::DataType: DEFAULT keeps the input dtype, FLOAT_O32xC32 yields
// float32 and FLOAT_O16xC32 yields float16.
TEST(TestBasicArithReduction, StaticInferValueDType) {
    using ParamType = opr::Reduce::Param::DataType;
    DType F32 = dtype::Float32();
    DType F16 = dtype::Float16();

    auto run_test = [](const DType& itype, const DType& expected_otype,
                       ParamType param_dtype) {
        HostTensorGenerator<> gen;
        auto host_x = gen({2, 3, 4, 5});
        auto host_tshp =
                std::make_shared<HostTensorND>(host_x->comp_node(), dtype::Int32());
        host_tshp->resize({1}).ptr<int>()[0] = 1;

        auto graph = ComputingGraph::make();
        auto x_f32 = opr::Host2DeviceCopy::make(*graph, host_x);
        auto x = opr::TypeCvt::make(x_f32, itype);
        auto tshp = opr::Host2DeviceCopy::make(*graph, host_tshp);
        auto y = opr::Reduce::make(
                x, {opr::Reduce::Mode::SUM, MEGDNN_MAX_NDIM, param_dtype}, tshp);

        auto inferred = graph->static_infer_manager().infer_value(y.node());
        ASSERT_EQ(inferred.layout().dtype, expected_otype);
    };

    // arguments: input dtype, expected output dtype, reduce data type
    run_test(F32, F32, ParamType::DEFAULT);
    run_test(F16, F16, ParamType::DEFAULT);
    run_test(F32, F32, ParamType::FLOAT_O32xC32);
    run_test(F16, F32, ParamType::FLOAT_O32xC32);
    run_test(F32, F16, ParamType::FLOAT_O16xC32);
    run_test(F16, F16, ParamType::FLOAT_O16xC32);
}
// Behaviour on empty inputs: the SUM and PRODUCT cases listed here must run
// and yield either an empty output or one filled with the given value; the
// MAX, MIN, MEAN and SUM_SQR cases listed here must throw on execution.
TEST(TestBasicArithReduction, EmptyInput) {
    using Param = opr::Reduce::Param;
    using Mode = opr::Reduce::Mode;

    // Executes the reduce and expects success; a non-empty output must hold
    // target_val (cast to the output ctype) in every element.
    auto check_allow_empty = [](const Param& param, const TensorShape& inpshp,
                                double target_val) {
        HostTensorGenerator<> gen;
        auto graph = ComputingGraph::make();
        auto host_x = gen(inpshp);
        auto x = opr::Host2DeviceCopy::make(*graph, host_x);
        auto y = opr::Reduce::make(x, param, {});
        HostTensorND host_y;
        auto func = graph->compile({make_callback_copy(y, host_y)});
        func->execute().wait();

        if (host_y.shape().is_empty()) {
            ASSERT_TRUE(host_y.empty());
            return;
        }

        size_t size = host_y.layout().total_nr_elems();
#define cb(DType)                                            \
    if (host_y.layout().dtype == DType()) {                  \
        using ctype = typename DTypeTrait<DType>::ctype;     \
        auto ptr = host_y.ptr<ctype>();                      \
        ctype target = static_cast<ctype>(target_val);       \
        for (size_t i = 0; i < size; ++i) {                  \
            ASSERT_TRUE(ptr[i] == target);                   \
        }                                                    \
    }
        MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
#undef cb
    };

    // Executes the reduce and expects execution to throw.
    auto check_forbid_empty = [](const Param& param, const TensorShape& inpshp) {
        HostTensorGenerator<> gen;
        auto graph = ComputingGraph::make();
        auto host_x = gen(inpshp);
        auto x = opr::Host2DeviceCopy::make(*graph, host_x);
        auto y = opr::Reduce::make(x, param, {});
        HostTensorND host_y;
        auto func = graph->compile({make_callback_copy(y, host_y)});
        ASSERT_ANY_THROW(func->execute().wait());
    };

    // SUM over nothing is 0
    check_allow_empty({Mode::SUM, 0, {}}, {0}, 0);
    check_allow_empty({Mode::SUM, -1, {}}, {2, 0, 3}, 0);
    check_allow_empty({Mode::SUM, 1, {}}, {2, 0, 3}, 0);
    // PRODUCT over nothing is 1
    check_allow_empty({Mode::PRODUCT, 0, {}}, {0, 1, 2}, 1);
    check_allow_empty({Mode::PRODUCT, 1, {}}, {0, 0, 0}, 1);
    check_allow_empty({Mode::PRODUCT, 2, {}}, {0, 0, 0}, 1);

    // these mode / shape combinations must be rejected
    check_forbid_empty({Mode::MAX, 0, {}}, {0});
    check_forbid_empty({Mode::MIN, -1, {}}, {0, 1, 2});
    check_forbid_empty({Mode::MEAN, 0, {}}, {0, 0});
    check_forbid_empty({Mode::SUM_SQR, 1, {}}, {2, 1, 0});
}