#include "megbrain/opr/misc.h"
#include "megbrain/comp_node_env.h"
#include "megbrain/gopt/inference.h"
#include "megbrain/graph/event.h"
#include "megbrain/graph/execution_mask.h"
#include "megbrain/graph/grad_impl.h"
#include "megbrain/graph/helper.h"
#include "megbrain/opr/basic_arith.h"
#include "megbrain/opr/basic_arith_wrapper.h"
#include "megbrain/opr/blas.h"
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/indexing.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/opr/utility.h"
#include "megbrain/plugin/profiler.h"
#include "megbrain/utils/timer.h"
#include "megbrain/test/helper.h"
#include "megdnn/heuristic_cache.h"
#include "megdnn/oprs/base.h"
#include <array>
#include <atomic>
#include <chrono>
#include <memory>
using namespace mgb;
namespace mgb {
namespace cg {
// Local declaration of ComputingGraphImpl exposing only current_exec_env().
// NOTE(review): presumably mirrors a class defined inside the library so tests
// can reach this member -- confirm the declaration stays in sync with it.
class ComputingGraphImpl : public ComputingGraph {
public:
GraphExecutable::ExecEnv* current_exec_env();
};
// Local declaration of SeqCompNodeOptimizerImpl exposing only
// optimize_comp_nodes(); the destructor is declared before `public:` and is
// therefore private.
class SeqCompNodeOptimizerImpl : public SeqCompNodeOptimizer {
~SeqCompNodeOptimizerImpl() = default;
public:
void optimize_comp_nodes(const VarNodeArray& endpoints);
};
} }
namespace {
MGB_DEFINE_OPR_CLASS(PODDedupTestOpr, cg::SingleCNOperatorNodeBase) public:
struct Param {
int v0;
char v1;
} MGB_PACKED;
PODDedupTestOpr(ComputingGraph* owner, const Param& param)
: Super{owner, OperatorNodeConfig{}, "node", {}}, m_param(param) {
add_equivalence_component<PODHash<Param>>(&m_param);
add_output(None)->dtype(dtype::Byte());
}
static SymbolVar make(ComputingGraph& owner, const Param& param) {
return owner.insert_opr(std::make_unique<PODDedupTestOpr>(&owner, param))
->output(0);
}
private:
Param m_param;
void scn_do_execute() override {}
void init_output_comp_node() override {
output(0)->comp_node(CompNode::load("xpu0"));
}
void init_output_static_infer_desc() override {
using namespace mgb::cg::static_infer;
owner_graph()->static_infer_manager().register_shape_infer(
output(0),
{SourceType::CONSTANT, {}, [](TensorShape& dest, const InpVal&) {
dest = {1};
return true;
}});
}
};
// Test-only operator with one Float32 output of shape {2} and one workspace
// output of shape {3}; execution verifies both device tensors have the
// declared shapes and records that it ran.
MGB_DEFINE_OPR_CLASS(
        WorkspaceAllocTestOpr, cg::SingleCNOutshapePureByInshapeOprBase)
    // Output shapes are fixed and do not depend on the input shape.
    void get_output_var_shape(
            const TensorShapeArray& inp_shape,
            TensorShapeArray& out_shape) const override {
        MGB_MARK_USED_VAR(inp_shape);
        out_shape.at(0) = {2};
        out_shape.at(1) = {3};
    }

    void scn_do_execute() override {
        ASSERT_EQ(TensorShape{2}, output(0)->dev_tensor().shape());
        ASSERT_EQ(TensorShape{3}, output(1)->dev_tensor().shape());
        executed = true;
    }

public:
    // Set to true once scn_do_execute() has passed both shape checks.
    bool executed = false;

    WorkspaceAllocTestOpr(VarNode* inp)
            : Super(inp->owner_graph(), {}, "test", {inp}) {
        add_input({inp});
        add_output("out")->dtype(dtype::Float32());
        // Appends the second (workspace) output.
        cg::add_workspace_output(this);
    }
};
// Test-only two-input operator with a single output of constant shape {2}.
// Its gradient function (defined separately via MGB_IMPL_OPR_GRAD) records
// bookkeeping in the public members below.
MGB_DEFINE_OPR_CLASS(AllInputGradOpr, cg::SingleCNOutshapePureByInshapeOprBase)
    // The output shape is fixed; the input shapes are intentionally ignored.
    void get_output_var_shape(
            const TensorShapeArray& inp_shape,
            TensorShapeArray& out_shape) const override {
        // Mark as used, consistent with WorkspaceAllocTestOpr, to avoid an
        // unused-parameter warning.
        MGB_MARK_USED_VAR(inp_shape);
        out_shape.at(0) = {2};
    }

    // Nothing to compute: only gradient construction is exercised.
    void scn_do_execute() override {}

public:
    // Number of times the gradient function has been invoked on this opr.
    size_t nr_grad_call = 0;
    // out_grad[0] passed to the most recent gradient function invocation.
    VarNode* prev_out_grad = nullptr;

    AllInputGradOpr(VarNode* a, VarNode* b)
            : Super(a->owner_graph(), {}, "all_inp_grad", {a, b}) {
        add_input({a, b});
        add_output(None);
    }
};
MGB_DYN_TYPE_OBJ_FINAL_IMPL(AllInputGradOpr);
// Gradient of AllInputGradOpr: returns gradients for all inputs at once.
// Odd-numbered calls yield {x + y, nullptr}; even-numbered calls yield
// {nullptr, x * y}. The call count and the incoming out_grad[0] are recorded
// on the operator so tests can inspect them.
MGB_IMPL_OPR_GRAD(AllInputGradOpr) {
    // The grad function receives a const opr; the bookkeeping members are
    // updated through a non-const alias.
    auto&& mutable_opr = const_cast<AllInputGradOpr&>(opr);
    mutable_opr.prev_out_grad = out_grad[0];
    ++mutable_opr.nr_grad_call;

    SymbolVar x = opr.input(0), y = opr.input(1);
    const bool odd_call = mutable_opr.nr_grad_call & 1;
    if (!odd_call) {
        return VarNodeArray{nullptr, (x * y).node()};
    }
    return VarNodeArray{(x + y).node(), nullptr};
}
template <bool dynamic, typename dtype>
void test_aplusb() {
using Gen = HostTensorGenerator<dtype>;
using ctype = typename Gen::ctype;
Gen gen;
constexpr size_t SIZE = 1234;
auto host_x = gen({SIZE}), host_y = gen({SIZE});
auto graph = ComputingGraph::make();
SymbolVar x = opr::Host2DeviceCopy::make(*graph, host_x).rename("x"),
y = opr::Host2DeviceCopy::make(*graph, host_y).rename("y");
if (dynamic) {
x = opr::MarkDynamicVar::make(x).rename("xd");
y = opr::MarkDynamicVar::make(y).rename("yd");
}
auto z = opr::add(x, y).rename("z");
HostTensorND host_z;
auto func = graph->compile({make_callback_copy(z, host_z)});
for (ctype delta = 0; delta < 2; ++delta) {
auto px = host_x->template ptr<ctype>();
px[0] += delta; func->execute();
auto py = host_y->template ptr<ctype>(), pz = host_z.template ptr<ctype>();
ASSERT_EQ(host_x->shape(), host_z.shape());
for (size_t i = 0; i < SIZE; ++i) {
MGB_ASSERT_FLOAT_EQ(px[i] + py[i], pz[i])
<< ssprintf("failed at %zu: %g+%g", i, float(px[i]), float(py[i]));
}
}
}
// Device memory allocator for tests that keeps a reference to every static
// allocation it serves, so the number of calls can be queried, and that can
// optionally check the requested size.
class TrackableStaticMemAlloc final : public cg::DeviceMemoryAllocator {
    // One entry per alloc_static() call; also keeps the storages referenced.
    SmallVector<DeviceTensorStorage> m_refhold;

public:
    // Value reported by static_alloc_version().
    size_t version_num = 0;
    // When non-zero, every alloc_static() request must have exactly this size.
    size_t size_expect = 0;

    void alloc_static(
            ComputingGraph*, DeviceTensorStorage& dest, size_t size) override {
        dest.ensure_size(size);
        m_refhold.emplace_back(dest);
        if (size_expect != 0) {
            ASSERT_EQ(size_expect, size);
        }
    }

    // Number of alloc_static() calls served so far.
    size_t nr_call() const { return m_refhold.size(); }

    size_t static_alloc_version(ComputingGraph*) const override {
        return version_num;
    }
};
// Device memory allocator for tests that tracks which vars currently own a
// dynamically allocated buffer. A var is added to the alive set when its
// buffer is allocated and removed again by the buffer's deleter; the
// destructor expects the set to be empty.
class TrackableDynamicMemAlloc final : public cg::DeviceMemoryAllocator {
// Vars whose dynamically allocated buffer has not been released yet.
ThinHashSet<VarNode*> m_alive_vars;
// Guards m_alive_vars in alloc_dynamic() and in the deleter.
std::mutex m_mtx;
public:
// Allocates `size` bytes on dest's comp node and installs them into `dest`
// with a deleter that frees the memory and un-registers `var`.
void alloc_dynamic(VarNode* var, DeviceTensorStorage& dest, size_t size) override {
// The allocator is only expected to be asked to grow the storage.
ASSERT_LT(dest.size(), size);
MGB_LOCK_GUARD(m_mtx);
auto ptr = dest.comp_node().alloc_device(size);
auto ins = m_alive_vars.insert(var);
// Each var may hold at most one live buffer at a time.
// NOTE(review): if this assertion fails the function returns before `ptr`
// is handed to `dest`, so the allocation is not released here.
ASSERT_TRUE(ins.second);
// Deleter: free the device memory first, then erase the var under the lock.
auto del = [this, var, cn = dest.comp_node()](void* ptr) {
cn.free_device(ptr);
MGB_LOCK_GUARD(m_mtx);
auto nr = m_alive_vars.erase(var);
ASSERT_EQ(1u, nr);
};
dest.reset(dest.comp_node(), size, {static_cast<dt_byte*>(ptr), del});
}
// Read-only view of the vars that currently own a buffer (not locked).
const ThinHashSet<VarNode*>& alive_vars() const { return m_alive_vars; }
// All buffers must have been released before the allocator is destroyed.
~TrackableDynamicMemAlloc() { EXPECT_TRUE(m_alive_vars.empty()); }
};
}
MGB_DYN_TYPE_OBJ_FINAL_IMPL(PODDedupTestOpr);
MGB_DYN_TYPE_OBJ_FINAL_IMPL(WorkspaceAllocTestOpr);
// x + y with Float32 inputs, without MarkDynamicVar.
TEST(TestGraphBasic, APlusBF32) {
test_aplusb<false, dtype::Float32>();
}
// x + y with Int32 inputs, without MarkDynamicVar.
TEST(TestGraphBasic, APlusBI32) {
test_aplusb<false, dtype::Int32>();
}
// x + y with Float32 inputs, both wrapped in MarkDynamicVar.
TEST(TestGraphBasic, DynAPlusBF32) {
test_aplusb<true, dtype::Float32>();
}
// x + y with Int32 inputs, both wrapped in MarkDynamicVar.
TEST(TestGraphBasic, DynAPlusBI32) {
test_aplusb<true, dtype::Int32>();
}
// Adds two float vectors whose host tensors are generated on "cpu0" and
// checks the sum element-wise.
TEST(TestGraph, APlusBOnCPU) {
    HostTensorGenerator<> gen;
    constexpr size_t SIZE = 1234;
    auto host_x = gen({SIZE}, "cpu0");
    auto host_y = gen({SIZE}, "cpu0");

    auto graph = ComputingGraph::make();
    SymbolVar x = opr::Host2DeviceCopy::make(*graph, host_x).rename("x");
    SymbolVar y = opr::Host2DeviceCopy::make(*graph, host_y).rename("y");
    auto z = (x + y).rename("z");

    HostTensorND host_z;
    auto func = graph->compile({make_callback_copy(z, host_z)});
    func->execute();

    ASSERT_EQ(host_x->shape(), host_z.shape());
    auto lhs = host_x->ptr<float>();
    auto rhs = host_y->ptr<float>();
    auto sum = host_z.ptr<float>();
    for (size_t idx = 0; idx < SIZE; ++idx) {
        MGB_ASSERT_FLOAT_EQ(lhs[idx] + rhs[idx], sum[idx]);
    }
}
// Operators built from the same shared device tensors must map to the same
// var (regardless of add() operand order), while a different tensor must
// yield a different var.
TEST(TestGraph, DeDup) {
    auto new_dev_tensor = [] {
        return std::make_shared<DeviceTensorND>(
                CompNode::load("xpu0"), TensorShape{2, 2});
    };
    auto t0 = new_dev_tensor();
    auto t1 = new_dev_tensor();
    auto t2 = new_dev_tensor();

    auto graph = ComputingGraph::make();
    auto st0 = opr::SharedDeviceTensor::make(*graph, t0);
    auto st1 = opr::SharedDeviceTensor::make(*graph, t1);

    // Same operands as x, in swapped order and via freshly made oprs.
    SymbolVar x = opr::add(st0, st1);
    SymbolVar y = opr::add(
            opr::SharedDeviceTensor::make(*graph, t1),
            opr::SharedDeviceTensor::make(*graph, t0));
    // One operand refers to a different tensor (t2).
    SymbolVar z = opr::add(
            opr::SharedDeviceTensor::make(*graph, t0),
            opr::SharedDeviceTensor::make(*graph, t2));

    EXPECT_EQ(x.node(), y.node());
    EXPECT_NE(x.node(), z.node());
}
// Operators created with byte-identical Param values are de-duplicated;
// changing a single field produces a distinct operator.
TEST(TestGraph, PODDeDup) {
    auto graph = ComputingGraph::make();

    PODDedupTestOpr::Param param{42, 'x'};
    auto first = PODDedupTestOpr::make(*graph, param);
    auto same_as_first = PODDedupTestOpr::make(*graph, param);

    param.v1 = 'y';
    auto different = PODDedupTestOpr::make(*graph, param);

    EXPECT_NE(first.node(), different.node());
    EXPECT_NE(same_as_first.node(), different.node());
    EXPECT_EQ(first.node(), same_as_first.node());
}
// Runs a graph spanning two comp nodes (with extra streams on the first) and
// checks sum = 2 * opr0 + opr1. Timing expectations around the Sleep
// operators are only reported through mgb_log_warn; they never fail the test.
TEST(TestGraph, MultiCard) {
auto cns = load_multiple_xpus(2);
constexpr size_t SIZE = 123456;
constexpr double SLEEP_TIME = 0.8, MAX_EXE_TIME = 0.5;
HostTensorGenerator<> gen;
auto host_opr0 = gen({SIZE}, cns[0]), host_opr1 = gen({SIZE}, cns[1]);
auto graph = ComputingGraph::make();
graph->options().var_sanity_check_first_run = false;
// Each input is delayed by a Sleep; opr1 is then copied to stream 1 of
// cns[0], and the final sum runs on stream 2 of cns[0] before one more Sleep.
SymbolVar opr0 = opr::Host2DeviceCopy::make(*graph, host_opr0, {"opr0"}),
opr1 = opr::Host2DeviceCopy::make(*graph, host_opr1, {"opr1"}),
opr0_delay = opr::Sleep::make(opr0, SLEEP_TIME),
opr1_delay = opr::Sleep::make(opr1, SLEEP_TIME),
opr1_card0 = opr::Copy::make(
opr1_delay, OperatorNodeConfig{"opr1_card0"}.comp_node(
cns[0].change_stream(1))),
opr0_double = opr::add(opr0_delay, opr0_delay, {"opr0_double"}),
sum = opr::add(
opr0_double, opr1_card0,
OperatorNodeConfig{"sum"}.comp_node(cns[0].change_stream(2))),
sum_delay = opr::Sleep::make(sum, SLEEP_TIME);
HostTensorND host_sum;
// The callback copies without syncing; host_sum is synced explicitly below.
auto func = graph->compile(
{{sum_delay, [&](DeviceTensorND& s) { host_sum.copy_from(s); }}});
RealTimer timer;
func->execute();
// Time taken by execute() itself, measured before any explicit sync.
auto use_time = timer.get_secs();
if (use_time >= MAX_EXE_TIME) {
mgb_log_warn("expect time [%f < %f], got %f", use_time, MAX_EXE_TIME, use_time);
}
ASSERT_EQ(host_sum.layout(), host_opr0->layout());
auto p0 = host_opr0->ptr<float>(), p1 = host_opr1->ptr<float>(),
ps = host_sum.sync().ptr<float>();
// After the sync the total time is expected to lie in
// (2 * SLEEP_TIME, 3 * SLEEP_TIME); deviations are only logged.
use_time = timer.get_secs();
if (use_time <= SLEEP_TIME * 2) {
mgb_log_warn(
"expect time [%f > %f], got %f", use_time, SLEEP_TIME * 2, use_time);
}
use_time = timer.get_secs();
if (use_time >= SLEEP_TIME * 3) {
mgb_log_warn(
"expect time [%f < %f], got %f", use_time, SLEEP_TIME * 3, use_time);
}
for (size_t i = 0; i < SIZE; i++)
ASSERT_FLOAT_EQ(p0[i] * 2 + p1[i], ps[i]);
}
// Records timestamps around execute()/wait() calls on a graph containing a
// Sleep operator. Timing expectations are only reported via mgb_log_warn and
// never fail the test.
TEST(TestGraph, AsyncExec) {
static constexpr double SLEEP_TIME = 0.1;
HostTensorGenerator<> gen;
auto graph = ComputingGraph::make();
graph->options().var_sanity_check_first_run = false;
auto host_x = gen({1});
SymbolVar x0 = opr::Host2DeviceCopy::make(*graph, host_x),
xs = opr::Sleep::make(x0, SLEEP_TIME);
auto func = graph->compile({{xs, [](DeviceTensorND&) {}}});
RealTimer timer;
double t0, t1, t2, t3, t4, t5;
// t0..t2: one execute() followed by wait().
t0 = timer.get_secs();
func->execute();
t1 = timer.get_secs();
func->wait();
t2 = timer.get_secs();
// t2..t5: two back-to-back execute() calls followed by wait().
func->execute();
t3 = timer.get_secs();
func->execute();
t4 = timer.get_secs();
func->wait();
t5 = timer.get_secs();
// Expected: execute() returns in under SLEEP_TIME / 2 (t1-t0, t3-t2);
// completing a run takes more than SLEEP_TIME (t2-t0, t4-t2); the final wait
// takes more than SLEEP_TIME / 2; and the reported previous execution time
// lies in (SLEEP_TIME, 1.5 * SLEEP_TIME).
if ((t1 - t0) >= SLEEP_TIME / 2 || (t2 - t0) <= SLEEP_TIME ||
(t3 - t2) >= SLEEP_TIME / 2 || (t4 - t2) <= SLEEP_TIME ||
(t5 - t4) <= SLEEP_TIME / 2 || func->get_prev_exec_time() <= SLEEP_TIME ||
func->get_prev_exec_time() >= SLEEP_TIME * 1.5) {
mgb_log_warn(
"time issue, pls check detail: [t0: %f, t1:%f, t2:%f, t3: %f, "
"t4: %f, t5: %f]",
t0, t1, t2, t3, t4, t5);
}
}
// Re-executes one compiled function with host inputs of varying sizes
// (growing, unchanged, and shrinking) and checks the element-wise sum.
TEST(TestGraph, VSizeTensor) {
    HostTensorGenerator<> gen;
    auto host_x = gen({1});
    auto host_y = gen({1});

    auto graph = ComputingGraph::make();
    auto dev_x = opr::Host2DeviceCopy::make(*graph, host_x);
    auto dev_y = opr::Host2DeviceCopy::make(*graph, host_y);
    auto dev_z = dev_x + dev_y;

    HostTensorND host_z;
    auto copy_result = [&](DeviceTensorND& z) { host_z.copy_from(z).sync(); };
    auto func = graph->compile({{dev_z, copy_result}});

    // Replaces both inputs with fresh tensors of the given length and
    // verifies the output against the host-side sum.
    auto run_with_size = [&](size_t inp_sz) {
        *host_x = *gen({inp_sz});
        *host_y = *gen({inp_sz});
        func->execute();
        ASSERT_EQ(host_z.shape(), TensorShape({inp_sz}));
        auto lhs = host_x->ptr<float>();
        auto rhs = host_y->ptr<float>();
        auto sum = host_z.ptr<float>();
        for (size_t idx = 0; idx < inp_sz; idx++) {
            ASSERT_EQ(lhs[idx] + rhs[idx], sum[idx]);
        }
    };

    run_with_size(100);
    run_with_size(456);
    run_with_size(456);
    run_with_size(10);
}
// Compiles the same graph twice with different outputs and checks that the
// receiver info of the current computing sequence follows the most recent
// compile, and that only the requested output is (re)computed.
TEST(TestGraph, CompileTwice) {
    HostTensorGenerator<> gen;
    auto host_x = gen({1});

    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    auto y1 = x + 1;
    auto y2 = x + 2;
    auto z1 = opr::Copy::make(y1);
    auto z2 = opr::Copy::make(y2);

    // Nothing has been compiled yet, so neither var has receivers.
    EXPECT_TRUE(graph->var_receiver_in_current_comp_seq(y1.node()).empty());
    EXPECT_TRUE(graph->var_receiver_in_current_comp_seq(y2.node()).empty());

    HostTensorND host_z1, host_z2;

    // First compile: only the z1 branch is part of the sequence.
    auto func = graph->compile({make_callback_copy(z1, host_z1)});
    EXPECT_FALSE(graph->var_receiver_in_current_comp_seq(y1.node()).empty());
    EXPECT_TRUE(graph->var_receiver_in_current_comp_seq(y2.node()).empty());
    func->execute();
    EXPECT_EQ(host_x->ptr<float>()[0] + 1, host_z1.ptr<float>()[0]);
    EXPECT_FALSE(host_z2.storage().comp_node_valid());

    // Perturb host_z1 so that a later overwrite would be detectable.
    host_z1.ptr<float>()[0]++;

    // Second compile: only the z2 branch is part of the sequence.
    func = graph->compile({make_callback_copy(z2, host_z2)});
    EXPECT_TRUE(graph->var_receiver_in_current_comp_seq(y1.node()).empty());
    EXPECT_FALSE(graph->var_receiver_in_current_comp_seq(y2.node()).empty());
    func->execute();
    // host_z1 must keep the perturbed value; host_z2 now holds x + 2.
    EXPECT_NE(host_x->ptr<float>()[0] + 1, host_z1.ptr<float>()[0]);
    EXPECT_EQ(host_x->ptr<float>()[0] + 2, host_z2.ptr<float>()[0]);
}
// Concatenates two dynamic-shaped, delayed inputs living on different comp
// nodes onto a third comp node and checks the result column by column.
TEST(TestGraph, MultiCNDynamicInputs) {
    auto cns = load_multiple_xpus(3);
    HostTensorGenerator<> gen;
    auto host_x = gen({5, 2}, cns[0]);
    auto host_y = gen({5, 3}, cns[1]);

    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    auto y = opr::Host2DeviceCopy::make(*graph, host_y);
    auto xd = opr::Sleep::make(opr::MarkDynamicVar::make(x), 0.1);
    auto yd = opr::Sleep::make(opr::MarkDynamicVar::make(y), 0.2);
    auto z = opr::Concat::make({xd, yd}, 1, OperatorNodeConfig().comp_node(cns[2]));

    HostTensorND host_z;
    auto func = graph->compile({make_callback_copy(z, host_z)});
    func->execute();

    ASSERT_EQ(host_z.shape(), TensorShape({5, 5}));
    for (size_t row = 0; row < 5; ++row) {
        for (size_t col = 0; col < 5; ++col) {
            // Columns 0..1 come from x, columns 2..4 from y.
            const float expect = (col < 2)
                                       ? *host_x->ptr<float>({row, col})
                                       : *host_y->ptr<float>({row, col - 2});
            ASSERT_FLOAT_EQ(expect, *host_z.ptr<float>({row, col}));
        }
    }
}
// Uses the same var as both the data and the target shape of a Reshape, for
// a static and a dynamic var, and checks the output.
TEST(TestGraph, DepMapSameNode) {
    auto run_case = [](bool dyn) {
        auto graph = ComputingGraph::make();
        auto cn = CompNode::load("xpu0");
        auto x = SymbolVar::make_scalar(1, *graph, cn);
        if (dyn) {
            x = opr::MarkDynamicVar::make(x);
        }
        // x serves as both the value to reshape and the shape descriptor.
        auto y = opr::Reshape::make(x, x);

        HostTensorND host_y;
        auto func = graph->compile({make_callback_copy(y, host_y)});
        func->execute();
        ASSERT_EQ(TensorShape{1}, host_y.shape());
        ASSERT_EQ(1, host_y.ptr<dt_int32>()[0]);
    };

    for (bool dyn : {false, true}) {
        run_case(dyn);
    }
}
// Constructing the same invalid SetSubtensor twice must throw MegBrainError
// both times.
TEST(TestGraph, DoubleThrowOnInit) {
    HostTensorGenerator<> gen;
    auto host_x = gen({23});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);

    using Ad = opr::SetSubtensor::AxisIndexer;
    // The index is a float scalar (0.f).
    std::vector<Ad> axis_desc{Ad::make_index(0, x.make_scalar(0.f))};

    for (int attempt = 0; attempt < 2; ++attempt) {
        ASSERT_THROW(opr::SetSubtensor::make(x, x, axis_desc), MegBrainError);
    }
}
// When only the shape of a var is needed, the operators producing its value
// must not be executed: the injected callbacks must never fire.
TEST(TestGraph, ShapeOnlyDep) {
    HostTensorGenerator<> gen;
    auto host_x = gen({2, 3});
    auto graph = ComputingGraph::make();
    using Ad = opr::AxisAddRemove::AxisDesc;

    // Set by any CallbackInjector below that actually gets executed.
    bool shp_dep_exec = false;
    auto mark_executed = [&](DeviceTensorND&) { shp_dep_exec = true; };
    auto with_exec_check = [&](SymbolVar var) {
        return opr::CallbackInjector::make(var, mark_executed);
    };

    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    auto xd_ = opr::MarkDynamicVar::make(x);
    auto xd = with_exec_check(x.make_scalar(0)).broadcast(opr::GetVarShape::make(xd_));
    auto axadd = with_exec_check(opr::AxisAddRemove::make(xd, {Ad::make_add(0)}));
    auto y = opr::GetVarShape::make(axadd);

    HostTensorND host_y;
    auto func = graph->compile({make_callback_copy(y, host_y)});
    func->execute();

    // Convert the shape-valued output back into a TensorShape for comparison.
    DeviceTensorND shape_value{CompNode::default_cpu()};
    shape_value.copy_from(host_y);
    TensorShape y_as_shp;
    cg::copy_tensor_value_to_shape(y_as_shp, shape_value);

    ASSERT_EQ(TensorShape({1, 2, 3}), y_as_shp);
    ASSERT_FALSE(shp_dep_exec);
}
// x is read on its own comp node and, through a Copy, on a second comp node
// that is kept busy by a Sleep. A callback on the first branch spins until
// the callback on the copy branch has run, then checks the elapsed time.
// Finally both outputs are verified (y0 = x + 2 reshaped to {6, 2},
// y1 = x + 3).
TEST(TestGraph, MemAllocForAsyncRead) {
auto cns = load_multiple_xpus(2);
auto cn1 = cns[1];
HostTensorGenerator<> gen;
auto host_x = gen({4, 3}, cns[0]);
std::atomic_bool copy_issued = ATOMIC_VAR_INIT(false);
RealTimer timer;
// Busy-waits until cb_set_copy_issue has run, then requires that this
// happened within 0.1s of the timer reset below.
auto cb_wait_copy_issue = [&](DeviceTensorND&) {
while (!copy_issued.load())
;
auto t = timer.get_secs();
mgb_assert(t <= 0.1, "copy issue time too long: %.2f", t);
};
auto cb_set_copy_issue = [&](DeviceTensorND&) { copy_issued.store(true); };
// Output-spec entry whose callback copies to `host` without syncing;
// `host` is captured by reference and must outlive the compiled function.
auto make_cb_async = [](SymbolVar dev, HostTensorND& host) {
return std::make_pair(dev, [&](DeviceTensorND& d) { host.copy_from(d); });
};
auto graph = ComputingGraph::make();
graph->options().var_sanity_check_first_run = false;
graph->options().seq_opt.enable_seq_comp_node_opt = false;
auto x = opr::Host2DeviceCopy::make(*graph, host_x), xrshp = x.reshape({6, 2}),
xv0_static = xrshp + 3 - 2,
xv0_after_copy = opr::CallbackInjector::make(xv0_static, cb_wait_copy_issue),
xdyn = opr::MarkDynamicVar::make(xv0_after_copy), y0 = xdyn + 1,
xcp_cn1 = opr::CallbackInjector::make(
opr::Copy::make(x, {cn1}), cb_set_copy_issue),
y1 = xcp_cn1 + 3;
HostTensorND host_y0, host_y1;
auto func =
graph->compile({make_cb_async(y0, host_y0), make_cb_async(y1, host_y1)});
timer.reset();
// Keep cn1 busy for 0.2s before the graph is executed.
opr::Sleep::sleep(cn1, 0.2);
func->execute().wait();
// The reshape shares x's device pointer, while the dynamic var does not.
ASSERT_EQ(x.node()->prev_dev_ptr(), xrshp.node()->prev_dev_ptr());
ASSERT_NE(x.node()->prev_dev_ptr(), xdyn.node()->prev_dev_ptr());
ASSERT_EQ(TensorShape({6, 2}), host_y0.shape());
ASSERT_EQ(TensorShape({4, 3}), host_y1.shape());
for (size_t i = 0; i < 12; ++i) {
auto xv = host_x->ptr<float>()[i];
MGB_ASSERT_FLOAT_EQ(xv + 2, host_y0.ptr<float>()[i]);
MGB_ASSERT_FLOAT_EQ(xv + 3, host_y1.ptr<float>()[i]);
}
}
// A reshape of a shared device tensor must reuse the tensor's own memory;
// the check is repeated with a second tensor on the same graph.
TEST(TestGraph, EmptyStaticAlloc) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();

    for (int round = 0; round < 2; ++round) {
        auto host_x = gen({2, 3});
        auto dev_x = std::make_shared<DeviceTensorND>();
        dev_x->copy_from(*host_x);

        auto x = opr::SharedDeviceTensor::make(*graph, dev_x);
        auto y = x.reshape({6});

        auto func = graph->compile({{y, {}}});
        func->execute();
        // y must point at the very same buffer as the shared tensor.
        ASSERT_EQ(dev_x->raw_ptr(), prev_dev_ptr(y));
    }
}
// Splits x into three parts placed on three different comp nodes and
// coordinates the consumers through callbacks. Checks that the first split
// output's memory plan becomes invalid, that the second part plus one is
// computed correctly, and that the pointer observed for the third part
// matches prev_dev_ptr() after execution.
TEST(TestGraph, MultiOutRelease) {
auto cns = load_multiple_xpus(4);
auto cn0 = cns[1], cn1 = cns[2], cn2 = cns[3];
HostTensorGenerator<> gen;
auto host_x = gen({6, 3}, cns[0]), host_one = gen({1}, cns[0]);
host_one->ptr<float>()[0] = 1;
auto graph = ComputingGraph::make();
graph->options().var_sanity_check_first_run = false;
graph->options().async_exec_level = 0b10;
std::atomic_bool cn0_finished{false};
float* splt2_dev_ptr_produced = nullptr;
// Pre-allocated storage on cn2's copy stream; released inside
// cb_set_cn0_finish.
DeviceTensorStorage splt2_alloc;
splt2_alloc.comp_node(cn2.change_stream(CompNode::Stream::COPY)).ensure_size(6);
VarNode* split_out0 = nullptr;
// Runs after splt[0] + one0: spins until splt[0]'s memory plan is no longer
// valid, then releases splt2_alloc and signals cb_wait_cn0_finish.
auto cb_set_cn0_finish = [&](DeviceTensorND&) {
mgb_assert(split_out0->contain_flag(VarNode::Flag::RT_FORCE_DYNAMIC_MEM_ALLOC));
while (split_out0->mem_plan().valid()) {
// Compiler barrier so mem_plan() is re-read on every iteration.
asm volatile("" : : : "memory");
}
mgb_assert(!split_out0->dev_tensor_valid());
splt2_alloc = {};
cn0_finished.store(true);
};
// Blocks the `tmp` operator until cb_set_cn0_finish has completed.
auto cb_wait_cn0_finish = [&](DeviceTensorND&) {
while (!cn0_finished.load())
;
};
// Remembers the device pointer of splt[2] as seen by its output callback.
auto cb_record_ptr = [&](DeviceTensorND& dv) {
splt2_dev_ptr_produced = dv.ptr<float>();
};
auto x = opr::Host2DeviceCopy::make(*graph, host_x),
tmp = opr::CallbackInjector::make(
SymbolVar::make_scalar(
2.3f, *graph, cn1.change_stream(CompNode::Stream::COPY)),
cb_wait_cn0_finish),
one0 = opr::Host2DeviceCopy::make(*graph, host_one, {cn0}),
one1 = opr::Host2DeviceCopy::make(*graph, host_one, {cn1}),
one2 = opr::Host2DeviceCopy::make(*graph, host_one, {cn2});
set_priority(tmp, -100);
// Split along axis 0 into three equal parts, one per comp node (stream 23).
auto splt = opr::Split::make(
x, opr::Split::Options::make_average(0, 3),
OperatorNodeConfig{}.comp_node_arr(
{cn0.change_stream(23), cn1.change_stream(23),
cn2.change_stream(23)}));
HostTensorND host_y1;
split_out0 = splt[0].node();
auto func = graph->compile({
{opr::add(splt[0], one0, cn0), cb_set_cn0_finish},
{tmp, {}},
make_callback_copy(opr::add(splt[1], one1, cn1), host_y1),
{opr::add(splt[2], one2, cn2), {}}, {splt[2], cb_record_ptr},
});
func->execute();
func->to_json()->writeto_fpath(output_file("TestGraph.MultiOutRelease.json"));
ASSERT_EQ(TensorShape({2, 3}), host_y1.shape());
// splt[1] corresponds to rows 2..3 of x, hence the row offset of 2.
auto py1 = host_y1.ptr<float>(), px = host_x->ptr<float>({2});
for (size_t i = 0; i < 6; ++i)
MGB_ASSERT_FLOAT_EQ(px[i] + 1, py1[i]);
ASSERT_EQ(splt2_dev_ptr_produced, splt[2].node()->prev_dev_ptr());
}
// x and y are consumed both on their own comp node (after Sleep operators)
// and on a second stream. Results must be correct whether or not the second
// stream is delayed, and x / y must not use static var storage.
TEST(TestGraph, MemAllocForRemoteReadVars) {
    auto cn1 = CompNode::load("xpu0:1");
    HostTensorGenerator<> gen;
    auto host_x = gen({5, 6});
    auto host_y = gen({5, 6});

    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x).rename("x");
    auto y = opr::Host2DeviceCopy::make_no_fwd(*graph, host_y).rename("y");
    auto sum0 = (opr::Sleep::make(x, 0.2) * x + opr::Sleep::make(y, 0.2) * y)
                        .rename("sum0");
    auto sum1 = opr::add(x, y, {cn1}).rename("sum1");

    HostTensorND host_sum0, host_sum1;
    auto func = graph->compile(
            {make_callback_copy(sum0, host_sum0),
             make_callback_copy(sum1, host_sum1)});

    // Warm-up run.
    func->execute();
    func->wait();

    for (bool sleep_cn1 : {false, true}) {
        // Drop previous results so stale values cannot satisfy the checks.
        host_sum0 = {};
        host_sum1 = {};
        if (sleep_cn1) {
            opr::Sleep::sleep(cn1, 0.5);
        }
        func->execute();

        auto x_ptr = host_x->ptr<float>();
        auto y_ptr = host_y->ptr<float>();
        auto sum0_ptr = host_sum0.ptr<float>();
        auto sum1_ptr = host_sum1.ptr<float>();
        for (int idx = 0; idx < 30; ++idx) {
            auto xv = x_ptr[idx];
            auto yv = y_ptr[idx];
            ASSERT_FLOAT_EQ(xv * xv + yv * yv, sum0_ptr[idx]);
            ASSERT_FLOAT_EQ(xv + yv, sum1_ptr[idx]);
        }
    }

    ASSERT_FALSE(cg::is_static_var_storage(x.node()));
    ASSERT_FALSE(cg::is_static_var_storage(y.node()));
}
// GetVarShape on a host-to-device copy must report the current input length
// each time the host tensor is replaced.
TEST(TestGraph, ShapeOnlyInput) {
    HostTensorGenerator<> gen;
    auto host_x = gen({1});

    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    auto y = opr::GetVarShape::make(x);

    HostTensorND host_y;
    auto func = graph->compile({make_callback_copy(y, host_y)});

    for (size_t sz : {1, 3, 5}) {
        *host_x = *gen({sz});
        func->execute();
        const auto reported = size_t(host_y.ptr<dt_int32>()[0]);
        ASSERT_EQ(sz, reported);
    }
}
// `idx` is used both as an index (IndexAt) and as an arithmetic operand.
// The compiled function must list idx as a VALUE static-inference source,
// and y = x[idx] + idx * idx must be correct for every index.
TEST(TestGraph, HostAndDevValueDep) {
    HostTensorGenerator<dtype::Int32> gen;
    auto host_idx = gen({1});
    auto host_x = gen({3});
    host_idx->ptr<dt_int32>()[0] = 0;
    // x = [1, 2, 3]
    for (int k = 0; k < 3; ++k) {
        host_x->ptr<dt_int32>()[k] = k + 1;
    }

    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x).rename("x");
    auto idx = opr::Host2DeviceCopy::make(*graph, host_idx).rename("idx");
    auto xsub = opr::IndexAt::make(x, {{0, idx}}).rename("xsub");
    auto idx2 = (idx * idx).rename("idx2");
    auto y = (xsub + idx2).rename("y");
    set_priority(xsub, -10);
    set_priority(idx2, 10);

    HostTensorND host_y;
    auto func = graph->compile({make_callback_copy(y, host_y)});

    // Look for idx among the runtime static-inference sources, as a value dep.
    constexpr auto V = cg::static_infer::DepType::VALUE;
    bool found = false;
    for (auto&& dep : func->get_rt_static_source_deps()) {
        if (dep.dest == idx.node() && dep.type == V) {
            found = true;
            break;
        }
    }
    ASSERT_TRUE(found);

    for (int i = 0; i < 3; ++i) {
        host_idx->ptr<dt_int32>()[0] = i;
        func->execute();
        // x[i] == i + 1, so y == (i + 1) + i * i.
        ASSERT_EQ(i + 1 + i * i, host_y.ptr<dt_int32>()[0]) << "fail at " << i;
    }
}
// Vars registered in options().extra_vardeps of an output must be executed
// along with it: y0 observes x before the AddUpdate and y1 observes the
// updated value, although only xu is requested as an output.
TEST(TestGraph, ExtraVarDeps) {
    HostTensorND hv{dtype::Float32()};
    hv.comp_node(CompNode::load("xpu0")).resize({1}).ptr<float>()[0] = 0;
    auto dv = std::make_shared<DeviceTensorND>();
    dv->copy_from(hv);

    // Values observed by the two injected callbacks; -1 means "never ran".
    float cbv0 = -1, cbv1 = -1;
    auto read_into_cbv0 = [&](DeviceTensorND& v) {
        cbv0 = HostTensorND().copy_from(v).sync().ptr<float>()[0];
    };
    auto read_into_cbv1 = [&](DeviceTensorND& v) {
        cbv1 = HostTensorND().copy_from(v).sync().ptr<float>()[0];
    };

    auto graph = ComputingGraph::make();
    auto x = opr::SharedDeviceTensor::make(*graph, dv);
    auto xu = opr::AddUpdate::make(x, x.make_scalar(1.f));
    auto y0 = opr::CallbackInjector::make(x, read_into_cbv0);
    auto y1 = opr::CallbackInjector::make(xu, read_into_cbv1);

    auto&& extra_deps = graph->options().extra_vardeps[xu.node()];
    extra_deps.push_back(y0.node());
    extra_deps.push_back(y1.node());

    auto func = graph->compile({{xu, {}}});
    for (int iter = 0; iter < 3; ++iter) {
        func->execute();
        // Each run increments the shared tensor by one.
        MGB_ASSERT_FLOAT_EQ(iter, cbv0);
        MGB_ASSERT_FLOAT_EQ(iter + 1, cbv1);
    }
}
// Inserts a WorkspaceAllocTestOpr and checks that it exposes two outputs with
// the shapes declared by its get_output_var_shape() ({2} for the data output,
// {3} for the workspace output), and that it is executed when output 0 is
// requested.
TEST(TestGraph, WorkspaceAlloc) {
    auto graph = ComputingGraph::make();
    auto x = SymbolVar::make_scalar(0, *graph, CompNode::load("xpu0"));
    auto opr = graph->insert_opr(std::make_unique<WorkspaceAllocTestOpr>(x.node()));

    ASSERT_EQ(2u, opr->output().size());
    ASSERT_EQ(TensorShape{2}, opr->output(0)->shape());
    // The workspace output is declared as {3} by the operator and is asserted
    // to be {3} again inside scn_do_execute(); expecting an empty shape here
    // contradicted both.
    ASSERT_EQ(TensorShape{3}, opr->output(1)->shape());

    auto func = graph->compile({{opr->output(0), {}}});
    func->execute();
    ASSERT_TRUE(opr->cast_final_safe<WorkspaceAllocTestOpr>().executed);
}
// Equal scalar constants are de-duplicated, and an expression over constants
// is folded into an ImmutableTensor.
TEST(TestGraph, ConstFolding) {
    auto graph = ComputingGraph::make();
    auto a = SymbolVar::make_scalar(3, *graph, CompNode::load("xpu0"));
    auto b = SymbolVar::make_scalar(3, *graph, CompNode::load("xpu0"));
    auto c = a + b;
    auto d = a + b;

    ASSERT_EQ(a.node(), b.node());
    ASSERT_EQ(c.node(), d.node());
    ASSERT_NE(a.node(), c.node());

    // The sum must have been replaced by a constant.
    auto folded_type = d.node()->owner_opr()->dyn_typeinfo();
    ASSERT_EQ(folded_type, opr::ImmutableTensor::typeinfo());
}
// Two consecutive broadcasts are expected to be merged into a single
// Broadcast that reads directly from the original input.
TEST(TestGraph, MergeBroadcast) {
    HostTensorGenerator<> gen;
    auto host_x = gen({1});
    auto graph = ComputingGraph::make();
    auto a = opr::Host2DeviceCopy::make(*graph, host_x);
    auto b = a.broadcast({1, 2});
    auto c = b.broadcast({3, 4});

    // The previous check compared b.node() with itself, which can never fail.
    // Verify the merge instead: c must be produced by a Broadcast whose input
    // is `a`, i.e. the intermediate broadcast `b` has been bypassed.
    // NOTE(review): relies on the graph optimizer merging chained broadcasts
    // at the default graph_opt_level -- confirm.
    auto c_opr = c.node()->owner_opr();
    ASSERT_EQ(opr::Broadcast::typeinfo(), c_opr->dyn_typeinfo());
    ASSERT_EQ(a.node(), c_opr->input(0));
    ASSERT_NE(b.node(), c.node());
    ASSERT_EQ(c.node()->shape(), TensorShape({3, 4}));
}
// Applies a TypeCvt on top of a Broadcast and checks the operator type of
// the broadcast var and the dtype of the converted var.
// NOTE(review): the test name suggests it should verify that TypeCvt and
// Broadcast get swapped; the current assertions do not inspect c's owner
// operator -- confirm whether stronger checks are intended.
TEST(TestGraph, SwapTypeCvtAndBcast) {
    HostTensorGenerator<> gen;
    auto host_x = gen({1});
    auto graph = ComputingGraph::make();

    auto a = opr::Host2DeviceCopy::make(*graph, host_x);
    auto b = a.broadcast({1, 2});
    auto c = opr::TypeCvt::make(b, dtype::Int32());

    auto b_opr_type = b.node()->owner_opr()->dyn_typeinfo();
    ASSERT_EQ(b_opr_type, opr::Broadcast::typeinfo());
    ASSERT_EQ(c.node()->dtype(), dtype::Int32());
}
// After a graph has been compiled a second time, executing the function
// returned by the first compile must throw; both functions must have
// produced correct results while they were valid.
TEST(TestGraph, SingleGraphMultipleCompile) {
    HostTensorGenerator<> gen;
    auto host_x = gen({2, 3});

    auto graph = ComputingGraph::make();
    auto x = opr::MarkDynamicVar::make(opr::Host2DeviceCopy::make(*graph, host_x));
    auto y = x + 1;

    // Reference result: x + 1 computed on the host.
    HostTensorND host_y0, host_y1, host_y_expect;
    host_y_expect.copy_from(*host_x);
    const size_t nr_elems = host_x->shape().total_nr_elems();
    for (size_t idx = 0; idx < nr_elems; ++idx) {
        host_y_expect.ptr<float>()[idx]++;
    }

    auto func0 = graph->compile({make_callback_copy(y, host_y0)});
    func0->execute();
    auto func1 = graph->compile({make_callback_copy(y, host_y1)});
    func1->execute();

    // func0 has been superseded by the second compile.
    ASSERT_THROW(func0->execute(), MegBrainError);
    MGB_ASSERT_TENSOR_EQ(host_y_expect, host_y0);
    MGB_ASSERT_TENSOR_EQ(host_y_expect, host_y1);
}
// Registers a virtual receiver that links three input vars to three
// Host2DeviceCopy outputs through a user-supplied gradient function, then
// checks d(loss)/dx and the loss value against a host-side reference.
TEST(TestGraph, VarVirtualReceiverGrad) {
HostTensorGenerator<> gen;
constexpr size_t SIZE = 23;
auto graph = ComputingGraph::make();
// Binds out[j] as a virtual function of inp[j]: out[j] = (inp[j] + j)^(2 + j).
// The forward values are computed on the host and written into the host
// tensors behind `out`; the gradient is supplied by the `grad` lambda.
auto bind_vo = [&](const std::array<SymbolVar, 3>& inp,
const std::array<SymbolVar, 3>& out) {
HostTensorND iv[3];
ComputingGraph::OutputSpec outspec;
for (int i = 0; i < 3; ++i) {
outspec.push_back(make_callback_copy(inp[i], iv[i]));
inp[i].rename(ssprintf("vinp%d", i));
out[i].rename(ssprintf("vout%d", i));
}
// Evaluate the inputs once so their values are available in iv[].
graph->compile(outspec)->execute();
// d/dx (x + idx)^(idx + 2) = (idx + 2) * (x + idx)^(idx + 1), scaled by the
// incoming gradient of the matching output.
auto grad = [](const VarNodeArray& inp, const VarNodeArray&, size_t idx,
const VarNodeArray& out_grad) {
SymbolVar x = inp[idx], exp = x.make_scalar(float(idx + 2)),
gx = exp * opr::pow(x + float(idx), exp - 1.f) * out_grad[idx];
return gx.node();
};
VarNodeArray vinp(3), vout(3);
for (int i = 0; i < 3; ++i) {
vinp[i] = inp[i].node();
vout[i] = out[i].node();
}
cg::add_var_virtual_receiver(vinp, vout, grad);
float *iptr[3], *optr[3];
for (int i = 0; i < 3; ++i) {
iptr[i] = iv[i].ptr<float>();
// Each out[i] is a Host2DeviceCopy; write directly into its host tensor.
optr[i] = out[i].node()
->owner_opr()
->cast_final_safe<opr::Host2DeviceCopy>()
.host_data()
->ptr<float>();
}
for (size_t i = 0; i < SIZE; ++i) {
for (int j = 0; j < 3; ++j)
optr[j][i] = std::pow(iptr[j][i] + j, 2.0 + j);
}
};
std::shared_ptr<HostTensorND> host_x = gen({SIZE}), host_vo[3], host_loss_p[5];
for (int i = 0; i < 5; ++i) {
if (i < 3)
host_vo[i] = gen({SIZE});
host_loss_p[i] = gen({SIZE});
}
// Loss term helper: dot product of x with the idx-th random weight vector.
auto mkl = [&](SymbolVar x, size_t idx) {
return opr::Dot::make(x, opr::Host2DeviceCopy::make(*graph, host_loss_p[idx]));
};
// Terms 3 and 4 depend on x directly: x and x^-1.
auto x = opr::Host2DeviceCopy::make(*graph, host_x).rename("x"),
loss = mkl(x, 3) + mkl(opr::pow(x, x.make_scalar(-1.f)), 4);
std::array<SymbolVar, 3> vout;
for (int i = 0; i < 3; ++i) {
vout[i] = opr::Host2DeviceCopy::make(*graph, host_vo[i]);
loss = loss + mkl(vout[i], i);
}
// Terms 0..2 depend on x only through the virtual receiver.
bind_vo({x, x * .9f, x}, vout);
HostTensorND gx, host_loss;
auto func = graph->compile(
{make_callback_copy(cg::grad(loss, x), gx),
make_callback_copy(loss, host_loss)});
func->execute();
auto px = host_x->ptr<float>(), pgx = gx.ptr<float>();
// Host reference: every loss term j has the form plp[j] . (scale*x + bias)^exp.
float *plp[5], *pvo[3], scale[5], bias[5], exp[5];
for (int i = 0; i < 5; ++i) {
plp[i] = host_loss_p[i]->ptr<float>();
scale[i] = 1;
bias[i] = 0;
exp[i] = 1;
if (i < 3)
pvo[i] = host_vo[i]->ptr<float>();
}
// term 0: x^2; term 1: (0.9x + 1)^3; term 2: (x + 2)^4; term 3: x; term 4: x^-1
exp[0] = 2;
scale[1] = 0.9;
bias[1] = 1;
exp[1] = 3;
bias[2] = 2;
exp[2] = 4;
exp[4] = -1;
float loss_expect = 0;
for (size_t i = 0; i < SIZE; ++i) {
float gx = 0, x = px[i];
for (int j = 0; j < 5; ++j) {
auto a = scale[j], b = bias[j], c = exp[j];
auto base = a * x + b;
gx += plp[j][i] * c * a * std::pow(base, c - 1.f);
loss_expect += plp[j][i] * std::pow(base, c);
if (j < 3) {
// The virtual outputs must hold the forward values written by bind_vo.
MGB_ASSERT_FLOAT_EQ(std::pow(base, c), pvo[j][i]);
}
}
MGB_ASSERT_FLOAT_EQ(gx, pgx[i]);
}
MGB_ASSERT_FLOAT_EQ(loss_expect, host_loss.ptr<float>()[0]);
}
// clear_device_memory() must return 0 before an execution, 1 right after
// one, and 0 again when called a second time; this holds across repeated
// executions.
TEST(TestGraph, ClearDeviceMemory) {
    HostTensorGenerator<> gen;
    auto host_x = gen({2, 3});

    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    auto y = x + 1;
    auto func = graph->compile({{y, {}}});

    for (int round = 0; round < 2; ++round) {
        ASSERT_EQ(0u, graph->clear_device_memory());
        func->execute();
        ASSERT_EQ(1u, graph->clear_device_memory());
        ASSERT_EQ(0u, graph->clear_device_memory());
    }
}
// Requires two GPUs. Builds a loss that crosses devices through Copy
// operators, takes its gradient w.r.t. x and checks both the values and that
// the resulting host tensor's comp node is on the COPY stream.
TEST(TestGraph, CopyStream) {
    REQUIRE_GPU(2);
    HostTensorGenerator<> gen;
    auto cn0 = CompNode::load("gpu0");
    auto cn1 = CompNode::load("gpu1");
    auto host_x = gen({23}, cn0);

    auto sum_sqr = [](SymbolVar x) {
        return opr::reduce_sum_sqr(x, x.make_scalar(1));
    };

    auto graph = ComputingGraph::make();
    graph->options().log_level = 3;

    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    // loss0 = sum(x^2) + sum(x), computed on cn0 and copied to cn1.
    auto loss0 = opr::Copy::make(
            sum_sqr(x) + opr::reduce_sum(x, x.make_scalar(1)), cn1);
    // loss1 = sum(x^2), computed on cn1 from a copy of x.
    auto loss1 = sum_sqr(opr::Copy::make(x, cn1));
    auto gx = opr::VirtualGrad::make(loss0 + loss1, x);

    HostTensorND host_gx;
    auto func = graph->compile({make_callback_copy(gx, host_gx)});
    func->execute();

    ASSERT_EQ(host_gx.shape(), host_x->shape());
    auto x_ptr = host_x->ptr<float>();
    auto gx_ptr = host_gx.ptr<float>();
    for (size_t idx = 0; idx < 23; ++idx) {
        // d/dx (2 * x^2 + x) = 4x + 1
        MGB_ASSERT_FLOAT_EQ(x_ptr[idx] * 4 + 1, gx_ptr[idx]);
    }

    ASSERT_EQ(int(CompNode::Stream::COPY), host_gx.comp_node().locator().stream);
}
// The shape of a dynamic var is queried on a different comp node from the
// one that produces it; the reported shape value plus one must be 24 for a
// 23-element input, and the var's value must equal the input.
TEST(TestGraph, DynShapeDepCrossCN) {
    auto cns = load_multiple_xpus(2);
    HostTensorGenerator<> gen;
    auto host_x = gen({23}, cns[0]);

    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         y = opr::Sleep::make(x, 0.1, {true, true}),
         a = opr::MarkDynamicVar::make(y),
         ao = opr::Copy::make(a, cns[1].change_stream(1)),
         b = opr::GetVarShape::make(ao, {}, cns[1]) + 1;
    graph->options().var_sanity_check_first_run = false;
    graph->options().async_exec_level |= 0b10;
    // Give the shape computation a smaller priority value than the rest.
    set_priority(b, -100);

    HostTensorND host_a, host_b;
    auto func = graph->compile(
            {make_callback_copy(a, host_a, false),
             make_callback_copy(b, host_b, false)});
    func->execute().wait();

    MGB_ASSERT_TENSOR_EQ(*host_x, host_a);
    ASSERT_EQ(TensorShape{1}, host_b.shape());
    // host_b is read as int, so compare against an integer literal rather
    // than a float one.
    ASSERT_EQ(24, host_b.ptr<int>()[0]);
}
namespace {
// Test-only operator that, after sleeping 100ms, copies its input's device
// tensor into an externally owned DeviceTensorND. Its single output has a
// constant empty shape and is flagged ALLOW_EMPTY_SHAPE | VOLATILE_CONTENT.
MGB_DEFINE_OPR_CLASS(CustomCopy, cg::SingleCNOperatorNodeBase)
    // Destination tensor shared with the test body.
    std::shared_ptr<DeviceTensorND> m_data;

    void scn_do_execute() override {
        using namespace std::literals;
        // Delay the copy so that it is still pending while other oprs run.
        std::this_thread::sleep_for(100ms);
        m_data->copy_from(input(0)->dev_tensor());
    }

    void init_output_static_infer_desc() override {
        using namespace cg::static_infer;
        auto&& infer_mgr = owner_graph()->static_infer_manager();
        infer_mgr.register_shape_infer(output(0), ShapeInferDesc::make_const({}));
    }

public:
    // The operator runs on the comp node of the destination tensor \p dv.
    CustomCopy(VarNode* x, std::shared_ptr<DeviceTensorND> dv)
            : Super{x->owner_graph(), {dv->comp_node()}, "d2h", {x}}, m_data(dv) {
        add_input({x});
        using F = VarNode::Flag;
        add_output(None)
                ->add_flag(F::ALLOW_EMPTY_SHAPE)
                .add_flag(F::VOLATILE_CONTENT);
    }
};
MGB_DYN_TYPE_OBJ_FINAL_IMPL(CustomCopy);
}
// A delayed CustomCopy reads x on another stream while an AddUpdate modifies
// x in the same graph; the copy must still observe the original value of x.
TEST(TestGraph, DependentOnVolatileContent) {
    HostTensorGenerator<> gen;
    auto cn0 = CompNode::load("xpu0");
    auto cn1 = cn0.change_stream(1);
    auto host_x = gen({233}, cn0);
    auto dev_y = std::make_shared<DeviceTensorND>(cn1);

    auto graph = ComputingGraph::make();
    auto x = opr::SharedDeviceTensor::make(*graph, *host_x);
    auto y = x.insert_single_output_opr<CustomCopy>(x.node(), dev_y);
    auto x_new = opr::AddUpdate::make(x, x.make_scalar(1));

    auto func = graph->compile({{y, {}}, {x_new, {}}});
    func->execute().wait();

    HostTensorND host_y;
    host_y.copy_from(*dev_y).sync();
    // dev_y must hold x's value from before the in-place update.
    MGB_ASSERT_TENSOR_EQ(*host_x, host_y);
}
namespace {
// Checks the input waiting spec of the operator that produces \p dest.
// If \p dep is empty the spec must be empty; otherwise the spec must contain
// exactly one element, on dest's comp node, waiting for exactly \p dep.
void check_wait(SymbolVar dest, SymbolVar dep) {
    auto&& waiting_spec = dest.node()->owner_opr()->input_waiting_spec();
    if (!dep.node()) {
        ASSERT_EQ(0u, waiting_spec.size());
        return;
    }

    // unpack_vector extracts the single element expected in each container.
    cg::OperatorNodeBase::InputWaitingSpecElem spec_elem;
    unpack_vector(waiting_spec, spec_elem);
    ASSERT_EQ(spec_elem.comp_node, dest.node()->comp_node());

    VarNode* waited_var;
    unpack_vector(spec_elem.dev_ready, waited_var);
    ASSERT_EQ(dep, waited_var);
}
}
// Copies data back and forth between two comp nodes with explicit operator
// priorities, verifies the numeric results, and then checks which input each
// Copy operator waits on.
TEST(TestGraph, InputWaitingSpec) {
    auto cns = load_multiple_xpus(2);
    constexpr size_t SIZE = 12345;
    HostTensorGenerator<> gen;
    auto host_x = gen({SIZE}, cns[0]);

    auto graph = ComputingGraph::make();
    graph->options().seq_opt.enable_seq_comp_node_opt = false;
    auto cn0 = cns[0];
    auto cn1 = cns[1];

    auto x = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x);
    auto y0 = opr::Copy::make(x, cn1);
    auto y1 = opr::Copy::make(x + 1, cn1);
    auto z1 = opr::Copy::make(y1 + 1, cn0);
    auto z0 = opr::Copy::make(y0 + 1, cn0);
    set_priority(y0, 5);
    set_priority(y1, 10);
    set_priority(z1, 15);
    set_priority(z0, 20);

    HostTensorND host_z0, host_z1;
    auto func = graph->compile(
            {make_callback_copy(z0, host_z0), make_callback_copy(z1, host_z1)});
    func->execute();

    auto x_ptr = host_x->ptr<float>();
    auto z0_ptr = host_z0.ptr<float>();
    auto z1_ptr = host_z1.ptr<float>();
    for (size_t idx = 0; idx < SIZE; ++idx) {
        // z0 = x + 1; z1 = (x + 1) + 1
        MGB_ASSERT_FLOAT_EQ(x_ptr[idx] + 1, z0_ptr[idx]);
        MGB_ASSERT_FLOAT_EQ(x_ptr[idx] + 2, z1_ptr[idx]);
    }

    // Re-building `x + 1` / `y1 + 1` here is expected to yield the same vars
    // as above; z0 is expected to need no wait at all.
    check_wait(y0, x);
    check_wait(y1, x + 1);
    check_wait(z1, y1 + 1);
    check_wait(z0, {});
}
// Every output of a large Split is copied to another stream; only the copies
// of the first and the last split output are expected to wait on their input.
TEST(TestGraph, InputWaitingSpecMultiOut) {
    auto cn0 = CompNode::load("xpu0:0");
    auto cn1 = CompNode::load("xpu0:1");
    HostTensorGenerator<> gen;
    auto graph = cg::ComputingGraph::make();
    auto&& opt = graph->options();
    opt.graph_opt_level = 0;
    opt.var_sanity_check_first_run = 0;
    opt.async_exec_level = 0b100;
    opt.seq_opt.enable_seq_comp_node_opt = false;

    const size_t nr_out = 1024;
    const size_t length = 32;
    auto hv = gen({nr_out * length}, cn0);
    auto x = opr::Host2DeviceCopy::make(*graph, hv);
    auto outs = opr::Split::make(x, opr::Split::Options::make_average(0, nr_out));

    cg::ComputingGraph::OutputSpec output_spec;
    for (size_t i = 0; i < nr_out; ++i) {
        auto copied = opr::Copy::make(outs[i], cn1);
        // priority 0 for the first copy, then nr_out - i for the others
        auto&& attr = copied.node()->owner_opr()->node_prop().attribute();
        attr.priority = i ? nr_out - i : 0;
        output_spec.push_back({copied, {}});
    }

    auto func = graph->compile(output_spec);
    func->execute().wait();

    const size_t last = nr_out - 1;
    check_wait(output_spec[0].first, outs[0]);
    check_wait(output_spec[last].first, outs[last]);
    for (size_t i = 1; i < last; ++i) {
        check_wait(output_spec[i].first, {});
    }
}
// The gradient var has a static shape exactly when
// enable_grad_var_static_reshape is set, even though y itself is dynamic.
TEST(TestGraph, GradStaticShape) {
    constexpr size_t LEN = 234;
    for (bool enable : {false, true}) {
        auto graph = ComputingGraph::make();
        graph->options().enable_grad_var_static_reshape = enable;
        HostTensorGenerator<> gen;
        auto host_x = gen({LEN});

        auto x = opr::Host2DeviceCopy::make(*graph, host_x);
        auto x1 = x + 1.f;
        // y = (x + 1)^2 with one factor marked as dynamic
        auto y = opr::MarkDynamicVar::make(x1) * x1;
        auto gx = cg::grad(opr::reduce_sum(y, y.make_scalar(1)), x);

        ASSERT_FALSE(cg::is_static_var_shape(y.node()));
        ASSERT_EQ(enable, cg::is_static_var_shape(gx.node()));

        HostTensorND host_gx;
        auto func = graph->compile({make_callback_copy(gx, host_gx)});
        func->execute();

        auto inp = host_x->ptr<float>();
        auto grad = host_gx.ptr<float>();
        for (size_t idx = 0; idx < LEN; ++idx) {
            // d/dx (x + 1)^2 = 2 * (x + 1)
            MGB_ASSERT_FLOAT_EQ(2 * (inp[idx] + 1), grad[idx]);
        }
    }
}
// Checks the gradients produced through AllInputGradOpr for two losses, and
// that its grad function is invoked once per loss.
TEST(TestGraph, AllInputGrad) {
    auto graph = ComputingGraph::make();
    HostTensorGenerator<> gen;
    auto host_x = gen({2});
    auto host_y = gen({2});

    auto x = opr::Host2DeviceCopy::make(*graph, host_x, {"x"});
    auto y = opr::Host2DeviceCopy::make(*graph, host_y, {"y"});
    auto z = x.insert_single_output_opr<AllInputGradOpr>(x.node(), y.node());
    auto loss0 = opr::reduce_sum_sqr(z, z.make_scalar(1));
    auto loss1 = opr::reduce_sum_sqr(z * 2, z.make_scalar(1));

    auto&& op = z.node()->owner_opr()->cast_final_safe<AllInputGradOpr>();
    // thin wrapper fixing the two trailing boolean arguments of cg::grad
    auto grad = [](SymbolVar target, SymbolVar wrt) {
        return cg::grad(target, wrt, true, false);
    };

    // first loss: one grad call, gradient only w.r.t. x
    auto gx0 = grad(loss0, x);
    auto gy0 = grad(loss0, y);
    ASSERT_EQ(1u, op.nr_grad_call);
    ASSERT_EQ(x + y, gx0);
    ASSERT_EQ(nullptr, gy0.node());

    // second loss: one more grad call, gradient only w.r.t. y
    auto gx1 = grad(loss1, x);
    auto gy1 = grad(loss1, y);
    ASSERT_EQ(2u, op.nr_grad_call);
    ASSERT_EQ(nullptr, gx1.node());
    ASSERT_EQ(x * y, gy1);
}
// Builds and runs a small graph twice after finalizing all comp nodes.
TEST(TestGraph, CPPMemLeak) {
    auto run = []() {
        constexpr size_t LEN = 23;
        auto graph = ComputingGraph::make();
        HostTensorGenerator<> gen;
        auto host_x = gen({LEN}, "cpux");
        auto x = opr::Host2DeviceCopy::make(*graph, host_x);
        auto y0 = x + 1.f;
        auto y1 = x + 1.f;
        // both expressions must resolve to the same var
        ASSERT_EQ(y0, y1);

        HostTensorND host_y;
        auto func = graph->compile({make_callback_copy(y0, host_y)});
        func->execute();

        auto inp = host_x->ptr<float>();
        auto out = host_y.ptr<float>();
        for (size_t idx = 0; idx < LEN; ++idx) {
            MGB_ASSERT_FLOAT_EQ(inp[idx] + 1, out[idx]);
        }
    };
    CompNode::finalize();
    run();
    run();
}
// Replacing x by x1 in the chain x -> y -> z must yield the same vars as
// building the chain directly on x1.
TEST(TestGraph, ReplaceVarHelper) {
    auto graph = ComputingGraph::make();
    HostTensorGenerator<> gen;
    auto x = opr::Host2DeviceCopy::make(*graph, gen({2}));
    auto y = x + 1.f;
    auto z = y + 1.f;
    auto x1 = opr::Host2DeviceCopy::make(*graph, gen({3}));

    SymbolVar y1, z1;
    auto replaced = cg::replace_vars({y, z}, {{x, x1}});
    unpack_vector(replaced, y1, z1);

    ASSERT_EQ(x1 + 1.f, y1);
    ASSERT_EQ(y1 + 1.f, z1);
}
// After replace_vars on a var that has an extra var dependency, only the
// replacement counters (iter2, iter3) are updated by one execution.
TEST(TestGraph, ReplaceVarWithDeps) {
    auto cn = CompNode::load("xpu0");
    auto graph = ComputingGraph::make();
    HostTensorGenerator<> gen;
    // (storage of a counter, var that increments the counter by one)
    using Iter = std::pair<std::shared_ptr<DeviceTensorND>, SymbolVar>;

    // create a zero-initialized one-element counter together with its AddUpdate
    auto make_iter = [&]() -> Iter {
        HostTensorND init(cn, {1});
        init.ptr<float>()[0] = 0.0;
        auto dev = opr::SharedDeviceTensor::make(*graph, init);
        auto iter = opr::AddUpdate::make(dev, dev.make_scalar(1));
        auto&& shared_opr =
                dev.node()->owner_opr()->cast_final_safe<opr::SharedDeviceTensor>();
        return {shared_opr.dev_data(), iter};
    };
    // assert that the counter storage currently holds val
    auto check_iter = [&](float val, const Iter& iter) {
        HostTensorND readback(cn, {1});
        readback.copy_from_fixlayout(*iter.first);
        readback.sync();
        MGB_ASSERT_FLOAT_EQ(val, readback.ptr<float>()[0]);
    };

    auto iter0 = make_iter();
    auto iter1 = make_iter();
    auto iter2 = make_iter();
    auto iter3 = make_iter();

    auto a = iter0.second + 1;
    auto b = iter1.second + 2;
    auto c = b * 5;
    // make b additionally depend on a
    graph->options().extra_vardeps[b.node()].push_back(a.node());

    auto y = cg::replace_vars(
            {c}, {{iter0.second.node(), iter2.second.node()},
                  {iter1.second.node(), iter3.second.node()}});
    ASSERT_EQ(y.size(), 1u);

    HostTensorND host_y;
    auto func = graph->compile({make_callback_copy(y[0], host_y)});
    func->execute();

    // the original counters stay untouched, the replacements advance once
    check_iter(0, iter0);
    check_iter(0, iter1);
    check_iter(1, iter2);
    check_iter(1, iter3);
}
// reduce_max over the result of CondTake works for x = 2, but must raise a
// MegBrainError mentioning the empty input for x = -2.
TEST(TestGraph, EmptyShapeCheck) {
    auto cn = CompNode::load("xpux");
    auto graph = ComputingGraph::make();
    auto host_x = std::make_shared<HostTensorND>(cn, TensorShape{1});
    host_x->ptr<float>()[0] = 2;

    using Param = opr::CondTake::Param;
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    auto y = opr::CondTake::make(x, x, {Param::Mode::GT})[0];
    auto z = opr::reduce_max(y, y.make_scalar(1));

    HostTensorND host_z;
    auto func = graph->compile({make_callback_copy(z, host_z)});
    func->execute();
    MGB_ASSERT_FLOAT_EQ(2.f, host_z.ptr<float>()[0]);

    host_x->ptr<float>()[0] = -2;
    ASSERT_THROW(
            {
                try {
                    func->execute();
                } catch (const MegBrainError& exc) {
                    // verify the message, then let ASSERT_THROW see the error
                    const std::string msg{exc.what()};
                    const bool mentioned = msg.find("empty input is not allowed") !=
                                           std::string::npos;
                    ASSERT_TRUE(mentioned) << "bad message " << msg;
                    throw;
                }
            },
            MegBrainError);
}
// Checks var refcounts and storage kinds after execution, first with the
// default allocation strategy and then with force_dynamic_alloc.
TEST(TestGraph, RefCntManage) {
    HostTensorGenerator<> gen;
    auto cns = load_multiple_xpus(2);
    auto graph = ComputingGraph::make();
    auto host_x = gen({2, 3}, cns[0]);
    auto host_y = gen({1, 3}, cns[1]);

    auto x = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x).rename("x");
    auto y = opr::Host2DeviceCopy::make_no_fwd(*graph, host_y).rename("y");
    auto x_cn1 = opr::Copy::make(x, {cns[1]}).rename("x_cn1");
    auto z = (x_cn1 + y).rename("z");

    HostTensorND host_z;
    graph->options().seq_opt.enable_seq_comp_node_opt = false;
    graph->compile({make_callback_copy(z, host_z)})->execute();

    // a dynamic var must have released its tensor and have no references left
    auto chk_dyn = [](SymbolVar var) {
        auto node = var.node();
        ASSERT_FALSE(cg::is_static_var_storage(node)) << node->name();
        ASSERT_FALSE(node->dev_tensor_valid()) << node->name();
        ASSERT_EQ(0u, node->refcnt()) << node->name();
    };

    const bool cross_cn_mem_share = cns[0].mem_node() == cns[1].mem_node();
    for (auto var : {x, y, x_cn1, z}) {
        auto node = var.node();
        ASSERT_EQ(0u, node->refcnt()) << node->name();
        // x is always dynamic; x_cn1 too if both comp nodes share a mem node
        const bool expect_dynamic =
                node == x.node() || (cross_cn_mem_share && node == x_cn1.node());
        if (expect_dynamic) {
            chk_dyn(var);
        } else {
            ASSERT_TRUE(cg::is_static_var_storage(node)) << node->name();
            ASSERT_TRUE(node->dev_tensor_valid()) << node->name();
        }
    }

    // with forced dynamic allocation every var must behave dynamically
    graph->options().force_dynamic_alloc = true;
    HostTensorND host_z1;
    graph->compile({make_callback_copy(z, host_z1)})->execute();
    MGB_ASSERT_TENSOR_EQ(host_z, host_z1);
    for (auto var : {x, y, x_cn1, z}) {
        chk_dyn(var);
    }

    // same check when z is an output without any callback
    graph->compile({{z, {}}})->execute().wait();
    chk_dyn(z);
}
// For record levels 0, 1 and 2: the function works before CompNode::finalize()
// and executing it afterwards throws InternalError.
TEST(TestGraph, CompNodeFinalize) {
    constexpr int NR_EXEC = 5;
    for (int rec = 0; rec < 3; ++rec) {
        // recording is only exercised on cpu0
        auto cn = CompNode::load(rec ? "cpu0" : "xpux");
        HostTensorGenerator<> gen;
        auto graph = ComputingGraph::make();
        auto host_x = gen({1}, cn);
        auto host_y = gen({1}, cn);
        auto x = opr::Host2DeviceCopy::make(*graph, host_x);
        auto y = opr::Host2DeviceCopy::make(*graph, host_y);
        auto z = x + y;
        HostTensorND host_z;

        if (rec) {
            auto&& opt = graph->options();
            opt.var_sanity_check_first_run = false;
            opt.comp_node_seq_record_level = rec;
        }
        auto func = graph->compile({make_callback_copy(z, host_z)});
        if (rec == 2) {
            // at record level 2 the graph is destroyed right after compiling
            ComputingGraph::assert_destroy(graph);
        }

        for (int run = 0; run < NR_EXEC; ++run) {
            host_x->copy_from(*gen({1}, cn));
            func->execute();
            const float expect = host_x->ptr<float>()[0] + host_y->ptr<float>()[0];
            MGB_ASSERT_FLOAT_EQ(expect, host_z.ptr<float>()[0]);
        }

        CompNode::finalize();
        ASSERT_THROW(func->execute(), InternalError);
    }
}
namespace {
/*!
 * \brief user data that owns a reference to a computing graph and bumps an
 *      external counter when it is destructed
 */
class GraphHolder final : public UserDataContainer::UserData {
    MGB_TYPEINFO_OBJ_DECL;

    //! graph kept alive by this holder
    std::shared_ptr<ComputingGraph> m_graph;
    //! counter incremented once in the destructor
    int* m_del_chk;

public:
    GraphHolder(std::shared_ptr<ComputingGraph> graph, int* del_chk)
            : m_graph{std::move(graph)}, m_del_chk{del_chk} {}

    ~GraphHolder() { *m_del_chk += 1; }
};
MGB_TYPEINFO_OBJ_IMPL(GraphHolder);
}  // anonymous namespace
// The graph stores user data that owns the graph itself; the holder must only
// be destructed by CompNode::finalize().
TEST(TestGraph, CompNodeFinalizeRecursive) {
    int del_chk = 0;
    auto graph = ComputingGraph::make();
    // the maker moves the only outside reference of the graph into the holder
    auto make_holder = [&]() {
        return std::make_shared<GraphHolder>(std::move(graph), &del_chk);
    };
    graph->options().user_data.get_user_data_or_create<GraphHolder>(make_holder);
    graph.reset();
    ASSERT_EQ(0, del_chk);
    CompNode::finalize();
    ASSERT_EQ(1, del_chk);
}
#if MGB_NEED_MEGDNN_ASYNC_ERROR
// CompSeqExecFinished must be signaled once per execution even though every
// execution ends with a MegBrainError.
TEST(TestGraph, SignalCompSeqExecFinishedAsyncError) {
    REQUIRE_GPU(1);
    auto graph = ComputingGraph::make();
    HostTensorGenerator<dtype::Int32> gen;
    auto host_x = gen({10});
    auto host_y = gen({1});
    // index value used on the 10-element x; presumably out of range on purpose
    host_y->ptr<int>()[0] = 20;

    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    auto y = opr::Host2DeviceCopy::make(*graph, host_y);
    y = opr::MarkDynamicVar::make(y);
    using AIdx = opr::indexing::AxisIndexer;
    auto out1 = opr::IndexingMultiAxisVec::make({x}, {AIdx::make_index(0, y)});

    // count how often the finish event is delivered
    size_t exec_cnt = 0;
    auto on_finished = [&exec_cnt](const cg::event::CompSeqExecFinished& ev) {
        MGB_MARK_USED_VAR(ev);
        ++exec_cnt;
    };
    auto handle = graph->event().register_receiver<cg::event::CompSeqExecFinished>(
            on_finished);

    auto func = graph->compile({{out1, {}}});
    for (size_t run = 0; run < 3; ++run) {
        ASSERT_THROW(
                {
                    try {
                        func->execute().wait();
                    } catch (const MegBrainError&) {
                        ASSERT_EQ(exec_cnt, run + 1);
                        throw;
                    }
                },
                MegBrainError);
    }
}
// A compiled function must stay usable after one execution failed with a
// MegBrainError.
TEST(TestGraph, RecoverFromAsyncError) {
    REQUIRE_GPU(1);
    auto graph = ComputingGraph::make();
    HostTensorGenerator<dtype::Int32> gen;
    auto host_x = gen({10});
    auto host_y = gen({1});
    auto index = host_y->ptr<int>();
    index[0] = 5;

    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    auto y = opr::Host2DeviceCopy::make(*graph, host_y);
    y = opr::MarkDynamicVar::make(y);
    using AIdx = opr::indexing::AxisIndexer;
    auto out1 = opr::IndexingMultiAxisVec::make({x}, {AIdx::make_index(0, y)});
    auto func = graph->compile({{out1, {}}});

    // index 5 works
    func->execute().wait();

    ASSERT_THROW(
            {
                try {
                    // index 20 must fail ...
                    index[0] = 20;
                    func->execute().wait();
                } catch (const MegBrainError&) {
                    // ... and the index is restored before rethrowing
                    index[0] = 5;
                    throw;
                }
            },
            MegBrainError);

    // the function works again with the restored index
    func->execute().wait();
}
// Two independent graphs share the input x: the error of the first one must be
// reported when waiting on the first one, not by the second one.
TEST(TestGraph, AsyncErrorMultiCompGraph) {
    REQUIRE_GPU(1);
    HostTensorGenerator<dtype::Int32> gen;
    auto host_x = gen({10});
    auto host_y0 = gen({1});
    auto host_y1 = gen({1});

    // build a separate graph that indexes x by the given host value
    auto gen_func = [&](decltype(host_y0) host_y) {
        auto graph = ComputingGraph::make();
        auto x = opr::Host2DeviceCopy::make(*graph, host_x);
        auto y = opr::Host2DeviceCopy::make(*graph, host_y);
        y = opr::MarkDynamicVar::make(y);
        using AIdx = opr::indexing::AxisIndexer;
        auto out1 = opr::IndexingMultiAxisVec::make({x}, {AIdx::make_index(0, y)});
        return graph->compile({{out1, {}}});
    };
    auto func0 = gen_func(host_y0);
    auto func1 = gen_func(host_y1);

    auto idx0 = host_y0->ptr<int>();
    auto idx1 = host_y1->ptr<int>();

    ASSERT_THROW(
            {
                idx0[0] = 20;
                idx1[0] = 5;
                // launching func0 and fully running func1 must not throw
                ASSERT_NO_THROW({
                    func0->execute();
                    func1->execute().wait();
                });
                // the error of func0 shows up when waiting for it
                func0->wait();
            },
            MegBrainError);

    // both functions work once the bad index is replaced
    ASSERT_NO_THROW({
        idx0[0] = 5;
        idx1[0] = 5;
        func0->execute().wait();
        func1->execute().wait();
    });
}
#endif
// After execute() threw, wait() must not throw, CompSeqExecFinished must not be
// signaled for the failed run, and the function must be reusable.
TEST(TestGraph, WaitAfterException) {
    auto graph = ComputingGraph::make();
    HostTensorGenerator<dtype::Int32> gen;
    auto host_x = gen({10});
    auto host_y = gen({10});
    // non-zero while the injected callback is supposed to throw; assigned
    // before the first execution
    size_t flag;

    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    auto y = opr::Host2DeviceCopy::make(*graph, host_y);
    auto throw_if_flag = [&](DeviceTensorND&) {
        mgb_throw_if(flag, MegBrainError, "throw exception after a + b.");
    };
    auto z = opr::CallbackInjector::make(x + y, throw_if_flag);

    // the finish event itself throws if it is delivered for a failing run
    auto on_finished = [&](const cg::event::CompSeqExecFinished& ev) {
        MGB_MARK_USED_VAR(ev);
        mgb_throw_if(
                flag, MegBrainError,
                "It should not signal CompSeqExecFinished "
                "if any exception is thrown during execution.");
    };
    auto handle = graph->event().register_receiver<cg::event::CompSeqExecFinished>(
            on_finished);

    auto func = graph->compile({{z, {}}});

    flag = 1;
    ASSERT_THROW(func->execute(), MegBrainError);
    ASSERT_NO_THROW(func->wait());

    flag = 0;
    ASSERT_NO_THROW(func->execute().wait());
}
// While the exec env is paused by a worker thread, the second callback must not
// run; it must have run shortly after the exec env is resumed.
TEST(TestGraph, PauseExecEnv) {
    auto graph = ComputingGraph::make();
    graph->options().var_sanity_check_first_run = false;
    graph->options().async_exec_level = 0b100;
    HostTensorGenerator<> gen;
    auto host_x = gen({2, 3}, CompNode::default_cpu());

    // flag0: handshake between the first callback and the worker
    // flag1: set once the second callback has run
    std::atomic_bool flag0{false}, flag1{false};

    // announce entry, then spin until the worker clears the flag
    auto cb0 = [&flag0](DeviceTensorND&) {
        flag0.store(true);
        while (flag0.load()) {
            std::this_thread::yield();
        }
    };
    auto cb1 = [&flag1](DeviceTensorND&) { flag1.store(true); };

    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    auto y = opr::CallbackInjector::make(x, cb0);
    auto z = opr::CallbackInjector::make(y, cb1);
    auto func = graph->compile({{z, {}}});
    auto exec_env =
            static_cast<cg::ComputingGraphImpl*>(graph.get())->current_exec_env();

    auto worker = [&flag0, &flag1, exec_env]() {
        const std::chrono::milliseconds grace{100};
        // wait until the first callback is running
        while (!flag0.load()) {
            std::this_thread::yield();
        }
        // pause first, then release the first callback
        exec_env->pause_exec();
        flag0.store(false);
        std::this_thread::sleep_for(grace);
        ASSERT_FALSE(flag1.load());
        exec_env->resume_exec();
        std::this_thread::sleep_for(grace);
        ASSERT_TRUE(flag1.load());
    };

    std::thread worker_th{worker};
    func->execute();
    func->wait();
    worker_th.join();
}
// Tracks how often the custom static allocator is called when its version
// number or the required size changes, and that an invalid version is rejected.
TEST(TestGraph, CustomStaticDeviceMemoryAllocator) {
    HostTensorGenerator<> gen;
    auto host_x = gen({1, 3});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    auto y = x + opr::ImmutableTensor::make(*graph, *gen({2, 1}));
    auto func = graph->compile({{y, {}}});

    auto allocator = std::make_shared<TrackableStaticMemAlloc>();
    graph->set_device_memory_allocator(allocator);
    ASSERT_EQ(0u, allocator->nr_call());

    // every allocator call is expected to give y a new address
    ThinHashSet<const void*> y_addrs;
    size_t expected_nr_call = 1;
    auto check = [&]() {
        func->execute();
        y_addrs.insert(prev_dev_ptr(y));
        ASSERT_EQ(expected_nr_call, allocator->nr_call());
        ASSERT_EQ(expected_nr_call, y_addrs.size());
    };

    // bump the allocator version on every third run
    for (int run = 1; run < 12; ++run) {
        const bool bump_version = run % 3 == 0;
        if (bump_version) {
            ++expected_nr_call;
            ++allocator->version_num;
        }
        check();
    }

    // a larger input shape triggers one more allocation
    *host_x = *gen({1, 1023});
    ++expected_nr_call;
    check();

    // updating the plan alone must not call the allocator
    *host_x = *gen({1, 2047});
    auto plan_size = func->update_static_alloc_plan_and_get_size();
    allocator->size_expect = plan_size.at(host_x->comp_node());
    ASSERT_EQ(expected_nr_call, allocator->nr_call());
    ++expected_nr_call;
    check();

    allocator->version_num = TrackableStaticMemAlloc::VERSION_INVALID;
    ASSERT_THROW(func->execute(), MegBrainError);
}
// With forced dynamic allocation, only xp1 may be alive in the custom allocator
// while the injected callback runs, and nothing may be alive after execution.
TEST(TestGraph, CustomDynamicDeviceMemoryAllocator) {
    HostTensorGenerator<> gen;
    auto host_x = gen({1});
    auto allocator = std::make_shared<TrackableDynamicMemAlloc>();
    SymbolVar x, xp1, y, z;

    // runs on the value of xp1 in the middle of the sequence
    auto cb = [&](DeviceTensorND& dv) {
        HostTensorND seen;
        seen.copy_from(dv).sync();
        ASSERT_EQ(host_x->ptr<float>()[0] + 1.f, seen.ptr<float>()[0]);
        auto&& alive = allocator->alive_vars();
        EXPECT_EQ(1u, alive.count(xp1.node()));
        EXPECT_EQ(1u, alive.size());
    };

    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    graph->options().force_dynamic_alloc = true;
    x = opr::Host2DeviceCopy::make(*graph, host_x);
    xp1 = x + 1;
    y = opr::CallbackInjector::make(xp1, cb);
    z = y * 2;

    HostTensorND host_z;
    auto func = graph->compile({make_callback_copy(z, host_z)});
    graph->set_device_memory_allocator(allocator);
    ASSERT_TRUE(allocator->alive_vars().empty());

    // first run
    func->execute();
    ASSERT_EQ(2.f * (host_x->ptr<float>()[0] + 1.f), host_z.ptr<float>()[0]);
    ASSERT_TRUE(allocator->alive_vars().empty());

    // second run with a fresh input value
    *host_x = *gen({1});
    func->execute();
    ASSERT_EQ(2.f * (host_x->ptr<float>()[0] + 1.f), host_z.ptr<float>()[0]);
    ASSERT_TRUE(allocator->alive_vars().empty());
}
// The callback operator only runs while its execution mask is enabled.
TEST(TestGraph, ExecutionMask) {
    HostTensorGenerator<> gen;
    auto host_x = gen({1});
    int called = 0;
    auto count_call = [&](DeviceTensorND&) { ++called; };

    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    auto y = opr::CallbackInjector::make(x, count_call);

    auto exec_mask = std::make_shared<cg::ExecutionMask>(nullptr);
    exec_mask->register_to_opr(y.node()->owner_opr());
    auto func = graph->compile({{y, {}}});

    // nothing runs before the mask is enabled
    func->execute();
    ASSERT_EQ(0, called);

    // enabled: every execution invokes the callback
    exec_mask->enable(true);
    func->execute();
    ASSERT_EQ(1, called);
    func->execute();
    ASSERT_EQ(2, called);

    // disabled again: the counter stays unchanged
    exec_mask->enable(false);
    func->execute();
    ASSERT_EQ(2, called);
}
// A dynamically reshaped view of a shared tensor is consumed by a Sleep on
// another stream; the view must keep pointing at the shared storage.
TEST(TestGraph, AsyncRelease) {
    HostTensorGenerator<> gen;
    auto host_x = gen({1024});
    auto dev_x = std::make_shared<DeviceTensorND>();
    dev_x->copy_from(*host_x).sync();
    auto cn1 = host_x->comp_node().change_stream(1);

    // target shape of the reshape, fed from host: (2^k, 2^(10-k))
    auto host_tshp = std::make_shared<HostTensorND>(
            host_x->comp_node(), TensorShape{2}, dtype::Int32{});
    auto set_shape = [shp = host_tshp->ptr<int>()](int k) {
        shp[0] = 1 << k;
        shp[1] = 1 << (10 - k);
    };
    set_shape(0);

    auto graph = ComputingGraph::make();
    auto x = opr::SharedDeviceTensor::make(*graph, dev_x);
    auto tshp = opr::Host2DeviceCopy::make(*graph, host_tshp);
    auto x_fwd = x.reshape(tshp);
    auto y = opr::Sleep::make(x_fwd, 0.05, {}, cn1);
    auto func = graph->compile({{y, {}}});

    ASSERT_TRUE(cg::is_static_var_storage(x.node()));
    ASSERT_FALSE(cg::is_static_var_storage(x_fwd.node()));

    for (int run = 0; run < 3; ++run) {
        set_shape(run + 1);
        func->execute();
        ASSERT_EQ(prev_dev_ptr(x_fwd), dev_x->raw_ptr());
        ASSERT_EQ(TensorShape({2u << run, 1u << (9 - run)}), y.shape());
    }
}
// Executions interleaved with explicit static alloc plan updates and input
// shape changes must all succeed.
TEST(TestGraph, UpdateStaticAllocPlan) {
    HostTensorGenerator<> gen;
    auto host_x = gen({3});
    auto graph = ComputingGraph::make();
    auto inp = opr::Host2DeviceCopy::make(*graph, host_x);
    auto x = opr::Sleep::make(inp, 0.5);
    auto y = x + opr::ImmutableTensor::make(*graph, *gen({1}));
    auto func = graph->compile({{y, {}}});

    // plan updated explicitly before the first run
    func->update_static_alloc_plan_and_get_size();
    func->execute();

    // shape change without an explicit plan update
    *host_x = *gen({1023});
    func->execute();

    // shape change followed by an explicit plan update
    *host_x = *gen({2047});
    func->update_static_alloc_plan_and_get_size();
    func->execute();
}
// Data travels cpu -> gpu -> cpu with arithmetic on both sides; checked for a
// loaded cpu0 comp node and for the default cpu comp node.
TEST(TestGraph, CPUGPUHybrid) {
    REQUIRE_GPU(1);
    auto cn_gpu = CompNode::load("gpu0");
    for (auto&& cn_cpu : {CompNode::load("cpu0"), CompNode::default_cpu()}) {
        auto graph = ComputingGraph::make();
        HostTensorGenerator<> gen;
        constexpr size_t length = 23333;
        auto host_x = gen({length});
        graph->options().var_sanity_check_first_run = false;

        auto x = opr::Host2DeviceCopy::make(*graph, host_x, {cn_cpu});
        auto y = opr::Sleep::make(x, 0.5) * 2;
        auto z_gpu = opr::Copy::make(y, cn_gpu) + 1;
        auto z = opr::Copy::make(z_gpu, cn_cpu) * 2;

        HostTensorND host_z;
        auto func = graph->compile({make_callback_copy(z, host_z)});
        func->execute();

        for (size_t idx = 0; idx < length; ++idx) {
            // z = (x * 2 + 1) * 2
            MGB_ASSERT_FLOAT_EQ(
                    (host_x->ptr<float>()[idx] * 2 + 1) * 2,
                    host_z.ptr<float>()[idx]);
        }
    }
}
// A stream registered on one input of VirtualDep reaches the output only when
// it was registered on input 0.
TEST(TestGraph, In2OutOpStreamPropagate) {
    REQUIRE_GPU(1);
    HostTensorGenerator<> gen;
    SmallVector<std::shared_ptr<HostTensorND>> host_v = {gen({233}), gen({23})};
    using PropType = cg::SeqCompNodeOptimizer::StreamPropType;
    for (auto type : {PropType::STRONG, PropType::WEAK}) {
        for (size_t idx : {0, 1}) {
            auto graph = ComputingGraph::make();
            SymbolVarArray inp(2);
            for (size_t k = 0; k < 2; ++k) {
                inp[k] = opr::Host2DeviceCopy::make(*graph, host_v[k]);
            }
            auto out = opr::VirtualDep::make(inp);

            auto&& mgr = static_cast<cg::SeqCompNodeOptimizerImpl&>(
                    graph->seq_comp_node_optimizer());
            // request the COPY stream on the idx-th input only
            mgr.register_stream_var(
                    inp[idx].node(), PropType{CompNode::Stream::COPY, type});
            mgr.optimize_comp_nodes({out.node()});

            ASSERT_EQ(inp[0].node()->comp_node(), out.node()->comp_node());
            auto o_stream = out.node()->comp_node().locator().stream;
            int expect = 0;
            if (!idx) {
                expect = int(CompNode::Stream::COPY);
            }
            ASSERT_EQ(o_stream, expect);
        }
    }
}
// Instance ids of two configs are compared after various sequences of
// reset_instance_id() / update_instance_id() calls.
TEST(TestGraph, OperatorNodeConfigInstanceID) {
    OperatorNodeConfig config0, config1;
    void* p0 = &config0;
    void* p1 = &config1;

    // fresh configs agree; one update makes them differ; reset restores
    {
        ASSERT_EQ(config0.instance_id(), config1.instance_id());
        config0.update_instance_id(p0);
        ASSERT_NE(config0.instance_id(), config1.instance_id());
        config0.reset_instance_id();
        ASSERT_EQ(config0.instance_id(), config1.instance_id());
    }

    // the same single update on both configs gives equal ids
    {
        config0.reset_instance_id();
        config0.update_instance_id(p1);
        config1.reset_instance_id();
        config1.update_instance_id(p1);
        ASSERT_EQ(config0.instance_id(), config1.instance_id());
    }

    // different first updates stay different after a further update
    {
        config0.reset_instance_id();
        config0.update_instance_id(p0);
        config1.reset_instance_id();
        config1.update_instance_id(p1);
        ASSERT_NE(config0.instance_id(), config1.instance_id());
        config0.update_instance_id(p1);
        ASSERT_NE(config0.instance_id(), config1.instance_id());
    }

    // the same updates applied in a different order give different ids
    {
        config0.reset_instance_id();
        config0.update_instance_id(p1);
        config0.update_instance_id(p0);
        config1.reset_instance_id();
        config1.update_instance_id(p0);
        config1.update_instance_id(p1);
        ASSERT_NE(config0.instance_id(), config1.instance_id());
    }
}
// A dense NCHW44 ConvBias compiled at record level 2 must still execute after
// its graph was destroyed and the input was refreshed.
TEST(TestGraph, NaiveRecord2NCHW44) {
    auto cn = CompNode::load("cpu0");
    using ConvParam = megdnn::ConvBias::Param;
    ConvParam param;
    param.sparse = ConvParam::Sparse::DENSE;
    param.format = ConvParam::Format::NCHW44;

    HostTensorGenerator<> gen;
    auto host_x = gen({1, 2, 12, 12, 4}, cn);
    auto host_w = gen({2, 2, 3, 3, 4, 4}, cn);
    auto host_b = gen({1, 2, 1, 1, 4}, cn);
    HostTensorND host_z;

    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    auto w = opr::Host2DeviceCopy::make(*graph, host_w);
    auto b = opr::Host2DeviceCopy::make(*graph, host_b);
    auto z = opr::ConvBiasForward::make(x, w, b, param, {});

    auto&& opt = graph->options();
    opt.comp_node_seq_record_level = 2;
    opt.var_sanity_check_first_run = false;
    auto func = graph->compile({make_callback_copy(z, host_z)});
    ComputingGraph::assert_destroy(graph);

    // new input values with the same layout
    host_x->copy_from_fixlayout(*gen(host_x->shape(), cn));
    func->execute().wait();
}
namespace {
template <typename DnnOp, typename... Args>
typename megdnn::ExecutionPolicy try_find_any_weight_preprocess_algo(
DnnOp* dnn_op, const char* mgb_info, Maybe<bool>& found, Args&&... args) {
if (found.valid()) {
if (found.val()) {
return dnn_op->execution_policy();
} else {
return {};
}
}
for (auto&& algo :
dnn_op->get_all_algorithms_info_safe(std::forward<Args>(args)...)) {
dnn_op->execution_policy().algo = algo.desc;
auto layouts =
dnn_op->deduce_preprocessed_filter_layout(std::forward<Args>(args)...);
if (layouts.empty())
continue;
bool valid = false;
for (auto&& l : layouts) {
if (!l.is_empty()) {
valid = true;
break;
}
}
if (valid) {
found.emplace(true);
return {algo.desc, {}};
}
}
found.emplace(false);
mgb_log_warn("Can't find weight preprocess algo for op %s", mgb_info);
return {};
}
template <typename DnnOp, typename... Args>
typename megdnn::ExecutionPolicy try_find_any_bias_preprocess_algo(
DnnOp* dnn_op, const char* mgb_info, Maybe<bool>& found, Args&&... args) {
if (found.valid()) {
if (found.val()) {
return dnn_op->execution_policy();
} else {
return {};
}
}
for (auto&& algo :
dnn_op->get_all_algorithms_info_safe(std::forward<Args>(args)...)) {
dnn_op->execution_policy().algo = algo.desc;
auto layouts =
dnn_op->deduce_preprocessed_filter_layout(std::forward<Args>(args)...);
if (layouts.size() <= 1)
continue;
bool valid = false;
if (!layouts[1].is_empty()) {
valid = true;
}
if (valid) {
found.emplace(true);
return {algo.desc, {}};
}
}
found.emplace(false);
mgb_log_warn("Can't find bias preprocess algo for op %s", mgb_info);
return {};
}
void test_free_memory_in_weight_preprocess(int record_level, CompNode cn) {
HostTensorGenerator<> gen;
auto graph = ComputingGraph::make();
#if MGB_ENABLE_JSON
std::unique_ptr<GraphProfiler> profiler;
if (!record_level) {
profiler = std::make_unique<GraphProfiler>(graph.get());
}
#endif
graph->options().graph_opt.weight_preprocess = true;
graph->options().comp_node_seq_record_level = record_level;
auto mkvar = [&](const char* name, const TensorShape& shp) {
return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
};
auto mkcvar = [&](const char* name, const TensorShape& shp) {
return opr::SharedDeviceTensor::make_const(*graph, *gen(shp, cn)).rename(name);
};
auto x = mkvar("x", {1, 32, 16, 16});
opr::ConvBias::Param param_conv_bias;
param_conv_bias.pad_h = param_conv_bias.pad_w = 0;
param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
auto w1 = mkcvar("w1", {32, 32, 1, 1}), b1 = mkcvar("b1", {1, 32, 1, 1});
auto conv1 = opr::ConvBias::make(x, w1, b1, param_conv_bias);
Maybe<bool> wp1, wp2;
conv1.node()->owner_opr()->cast_final_safe<opr::ConvBias>().setup_algo_chooser(
[&](const cg::OperatorNodeBase* opr) {
return try_find_any_weight_preprocess_algo(
opr->cast_final_safe<opr::ConvBias>().megdnn_opr(),
opr->cname(), wp1, opr->input(0)->layout(),
opr->input(1)->layout(), opr->input(2)->layout(),
TensorLayout{}, opr->output(0)->layout());
});
opr::Convolution::Param param_conv;
param_conv.pad_h = param_conv.pad_w = 0;
param_conv.sparse = opr::Convolution::Param::Sparse::DENSE;
auto w2 = mkcvar("w2", {32, 32, 1, 1});
auto y = opr::Convolution::make(conv1, w2, param_conv);
y.node()->owner_opr()->cast_final_safe<opr::Convolution>().setup_algo_chooser(
[&](const cg::OperatorNodeBase* opr) {
return try_find_any_weight_preprocess_algo(
opr->cast_final_safe<opr::Convolution>().megdnn_opr(),
opr->cname(), wp2, opr->input(0)->layout(),
opr->input(1)->layout(), opr->output(0)->layout());
});
HostTensorND host_y;
auto func = graph->compile({make_callback_copy(y, host_y)});
func->execute();
func->execute();
auto check = [&](SymbolVar v) {
ASSERT_TRUE(v.node()->contain_flag(VarNode::Flag::MEMORY_NO_NEED));
ASSERT_TRUE(v.node()->dev_tensor().empty());
ASSERT_TRUE(v.node()->owner_opr()
->cast_final_safe<opr::SharedDeviceTensor>()
.get_dev_tensor()
.empty());
};
ASSERT_TRUE(wp1.valid() && wp2.valid());
if (wp1.val()) {
check(w1);
}
if (wp2.val()) {
check(w2);
}
#if MGB_ENABLE_JSON
if (profiler) {
func->wait();
profiler->to_json_full(func.get())
->writeto_fpath(output_file("weight_preprocess.json"));
}
#endif
}
}
// Weight-preprocess memory release without sequence recording, on xpu0.
TEST(TestGraph, FreeMemoryInWeightPreprocess) {
    constexpr int record_level = 0;
    test_free_memory_in_weight_preprocess(record_level, CompNode::load("xpu0"));
    // drop cached heuristic results so they do not reach other tests
    megdnn::HeuristicCache::instance().clear();
}
// Weight-preprocess memory release with record level 1, on cpu0.
TEST(TestGraph, RecordFreeMemoryInWeightPreprocess) {
    constexpr int record_level = 1;
    test_free_memory_in_weight_preprocess(record_level, CompNode::load("cpu0"));
    // drop cached heuristic results so they do not reach other tests
    megdnn::HeuristicCache::instance().clear();
}
namespace {
/*!
 * \brief test-only operator that depends on the host value of its input
 *
 * On execution it infers the value of its input through the static infer
 * manager and discards the result. The single output always has an empty
 * shape and carries the ALLOW_EMPTY_SHAPE and VOLATILE_CONTENT flags.
 */
MGB_DEFINE_OPR_CLASS(HostValueReader, cg::SingleCNOutshapePureByInshapeOprBase)
    //! infer the input value on the host; the result itself is not used
    void scn_do_execute() override {
        auto&& infer_mgr = owner_graph()->static_infer_manager();
        auto&& hv = infer_mgr.infer_value(input(0));
        MGB_MARK_USED_VAR(hv);
    }

    //! declare that input 0 is needed as a host value
    NodeProp* do_make_node_prop() const override {
        auto prop = Super::do_make_node_prop();
        prop->dep_map()[input(0)] = NodeProp::DepType::HOST_VALUE;
        return prop;
    }

    //! the output shape is empty regardless of the input shapes
    void get_output_var_shape(
            const TensorShapeArray&, TensorShapeArray& out_shape) const override {
        out_shape.at(0) = {};
    }

public:
    HostValueReader(VarNode* inp)
            : Super{inp->owner_graph(), {}, "host_value_reader", {inp}} {
        add_input({inp});
        auto out = add_output(None);
        out->add_flag(VarNode::Flag::ALLOW_EMPTY_SHAPE)
                .add_flag(VarNode::Flag::VOLATILE_CONTENT);
    }

    //! insert a HostValueReader on \p inp into its graph and return the output
    static SymbolVar make(SymbolVar inp) {
        auto graph = inp.node()->owner_graph();
        auto opr = graph->insert_opr(std::make_unique<HostValueReader>(inp.node()));
        return opr->output(0);
    }
};
MGB_DYN_TYPE_OBJ_FINAL_IMPL(HostValueReader);
}  // anonymous namespace
// A weight that is also read as a host value by another operator must not be
// released by weight preprocessing.
TEST(TestGraph, FreeMemoryInWeightPreprocessWithValueInfer) {
    HostTensorGenerator<> gen;
    CompNode cn = CompNode::load("xpux");
    auto graph = ComputingGraph::make();
    graph->options().graph_opt.weight_preprocess = true;
    graph->options().var_sanity_check_first_run = false;

    // named input var fed from host
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        auto var = opr::Host2DeviceCopy::make(*graph, gen(shp, cn));
        return var.rename(name);
    };
    // named constant var
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        auto var = opr::SharedDeviceTensor::make_const(*graph, *gen(shp, cn));
        return var.rename(name);
    };

    auto x = mkvar("x", {1, 32, 16, 16});
    auto w = mkcvar("w", {32, 32, 1, 1});
    auto y = opr::Convolution::make(x, w);

    Maybe<bool> found;
    auto&& conv = y.node()->owner_opr()->cast_final_safe<opr::Convolution>();
    conv.setup_algo_chooser([&](const cg::OperatorNodeBase* opr) {
        return try_find_any_weight_preprocess_algo(
                opr->cast_final_safe<opr::Convolution>().megdnn_opr(), opr->cname(),
                found, opr->input(0)->layout(), opr->input(1)->layout(),
                opr->output(0)->layout());
    });

    // second reader of w, through its host value
    auto reader = HostValueReader::make(w);

    HostTensorND host_y;
    auto func = graph->compile({make_callback_copy(y, host_y), {reader, {}}});
    func->execute();
    func->execute();

    // w must still be fully available
    auto w_node = w.node();
    ASSERT_FALSE(w_node->contain_flag(VarNode::Flag::MEMORY_NO_NEED));
    ASSERT_FALSE(w_node->dev_tensor().empty());
    auto&& shared = w_node->owner_opr()->cast_final_safe<opr::SharedDeviceTensor>();
    ASSERT_FALSE(shared.get_dev_tensor().empty());

    megdnn::HeuristicCache::instance().clear();
}
// A weight that is also consumed by an ordinary arithmetic expression must not
// be released by weight preprocessing.
TEST(TestGraph, FreeMemoryInWeightPreprocessWithMultiReader) {
    HostTensorGenerator<> gen;
    CompNode cn = CompNode::load("xpux");
    auto graph = ComputingGraph::make();
    auto&& opt = graph->options();
    opt.graph_opt.weight_preprocess = true;
    opt.var_sanity_check_first_run = false;
    opt.graph_opt_level = 0;

    // named input var fed from host
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        auto var = opr::Host2DeviceCopy::make(*graph, gen(shp, cn));
        return var.rename(name);
    };
    // named constant var
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        auto var = opr::SharedDeviceTensor::make_const(*graph, *gen(shp, cn));
        return var.rename(name);
    };

    auto x = mkvar("x", {1, 32, 16, 16});
    auto w = mkcvar("w", {32, 32, 1, 1});
    auto y = opr::Convolution::make(x, w);

    Maybe<bool> found;
    auto&& conv = y.node()->owner_opr()->cast_final_safe<opr::Convolution>();
    conv.setup_algo_chooser([&](const cg::OperatorNodeBase* opr) {
        return try_find_any_weight_preprocess_algo(
                opr->cast_final_safe<opr::Convolution>().megdnn_opr(), opr->cname(),
                found, opr->input(0)->layout(), opr->input(1)->layout(),
                opr->output(0)->layout());
    });

    // second reader of w, through its device value
    auto y1 = w * 2 + 1;

    HostTensorND host_y, host_y1;
    auto func = graph->compile(
            {make_callback_copy(y, host_y), make_callback_copy(y1, host_y1)});
    func->execute();
    func->execute();

    // w must still be fully available
    auto w_node = w.node();
    ASSERT_FALSE(w_node->contain_flag(VarNode::Flag::MEMORY_NO_NEED));
    ASSERT_FALSE(w_node->dev_tensor().empty());
    auto&& shared = w_node->owner_opr()->cast_final_safe<opr::SharedDeviceTensor>();
    ASSERT_FALSE(shared.get_dev_tensor().empty());

    megdnn::HeuristicCache::instance().clear();
}
// If a bias-preprocess algorithm is found for the first ConvBias, its bias
// tensor must have been released after execution.
TEST(TestGraph, FreeBias) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto cn = CompNode::load("xpu0");
    graph->options().graph_opt.weight_preprocess = true;

    // named input var fed from host
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        auto var = opr::Host2DeviceCopy::make(*graph, gen(shp, cn));
        return var.rename(name);
    };
    // named constant var
    auto mkcvar = [&](const char* name, const TensorShape& shp) {
        auto var = opr::SharedDeviceTensor::make_const(*graph, *gen(shp, cn));
        return var.rename(name);
    };

    auto x = mkvar("x", {1, 32, 16, 16});

    // both layers share the same dense, unpadded ConvBias parameters
    opr::ConvBias::Param param_conv_bias;
    param_conv_bias.pad_h = param_conv_bias.pad_w = 0;
    param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
    auto w1 = mkcvar("w1", {32, 32, 1, 1});
    auto b1 = mkcvar("b1", {1, 32, 1, 1});
    auto conv1 = opr::ConvBias::make(x, w1, b1, param_conv_bias);
    auto w2 = mkcvar("w2", {32, 32, 1, 1});
    auto conv2 = opr::ConvBias::make(conv1, w2, param_conv_bias);

    // whether a bias-preprocess algorithm was found for conv1
    Maybe<bool> wp1;
    auto&& conv1_opr = conv1.node()->owner_opr()->cast_final_safe<opr::ConvBias>();
    conv1_opr.setup_algo_chooser([&](const cg::OperatorNodeBase* opr) {
        return try_find_any_bias_preprocess_algo(
                opr->cast_final_safe<opr::ConvBias>().megdnn_opr(), opr->cname(),
                wp1, opr->input(0)->layout(), opr->input(1)->layout(),
                opr->input(2)->layout(), TensorLayout{}, opr->output(0)->layout());
    });

    HostTensorND host_y;
    auto func = graph->compile({make_callback_copy(conv2, host_y)});
    func->execute();
    func->execute();

    // a released tensor is flagged MEMORY_NO_NEED and has empty storage
    auto check = [&](SymbolVar v) {
        auto node = v.node();
        ASSERT_TRUE(node->contain_flag(VarNode::Flag::MEMORY_NO_NEED));
        ASSERT_TRUE(node->dev_tensor().empty());
        auto&& shared = node->owner_opr()->cast_final_safe<opr::SharedDeviceTensor>();
        ASSERT_TRUE(shared.get_dev_tensor().empty());
    };
    ASSERT_TRUE(wp1.valid());
    if (wp1.val()) {
        check(b1);
    }
}
// Splits an (N, C) tensor along axis 1 in two different ways -- once with an
// explicit comp node per output (C1 | C2) and once without explicit comp nodes
// (C3 | C4) -- copies all four outputs back to the host through callbacks and
// compares every element against the source tensor.
TEST(TestGraph, CallbackCaller) {
    using namespace opr;
    auto cns = load_multiple_xpus(3);
    constexpr size_t C1 = 20, C2 = 30, C3 = 10, C4 = 40;
    constexpr size_t N = 2, C = C1 + C2;
    // Both partitions must cover the whole channel axis, otherwise the index
    // arithmetic in the verification loop below would be wrong.
    static_assert(C3 + C4 == C, "both partitions must cover all channels");
    HostTensorGenerator<> gen;
    auto host_opr0 = gen({N, C}, cns[0]);
    auto graph = ComputingGraph::make();
    SymbolVar opr0 = opr::Host2DeviceCopy::make(*graph, host_opr0, {"opr0"});
    // split0: outputs are placed on cns[1] and cns[2] respectively.
    auto spl0 = opr::Split::make(
            opr0, Split::Options::make_partition(opr0, 1, {C1, C2}),
            OperatorNodeConfig("split0").comp_node_arr({cns[1], cns[2]}));
    // split1: no comp node is specified in the config.
    auto spl1 = opr::Split::make(
            opr0, Split::Options::make_partition(opr0, 1, {C3, C4}),
            OperatorNodeConfig("split1"));
    HostTensorND host_spl00, host_spl01, host_spl10, host_spl11;
    auto func = graph->compile(
            {make_callback_copy(spl0[0], host_spl00),
             make_callback_copy(spl0[1], host_spl01),
             make_callback_copy(spl1[0], host_spl10),
             make_callback_copy(spl1[1], host_spl11)});
    func->execute();
    auto o00 = host_spl00.ptr<float>(), o01 = host_spl01.ptr<float>(),
         o10 = host_spl10.ptr<float>(), o11 = host_spl11.ptr<float>(),
         c = host_opr0->ptr<float>();
    // Walk the source tensor linearly; (n, ch) is the row/channel of element i.
    // `i` is a size_t, so it is formatted with %zu (the unsigned conversion).
    for (size_t i = 0, it = host_opr0->layout().total_nr_elems(); i < it; i++) {
        auto ch = i % C;
        auto n = i / C;
        // First partition: channels [0, C1) go to spl0[0], the rest to spl0[1].
        if (ch < C1) {
            MGB_ASSERT_FLOAT_EQ(o00[n * C1 + ch], c[i]) << ssprintf("failed at %zu", i);
        } else {
            MGB_ASSERT_FLOAT_EQ(o01[n * C2 + ch - C1], c[i])
                    << ssprintf("failed at %zu", i);
        }
        // Second partition: channels [0, C3) go to spl1[0], the rest to spl1[1].
        if (ch < C3) {
            MGB_ASSERT_FLOAT_EQ(o10[n * C3 + ch], c[i]) << ssprintf("failed at %zu", i);
        } else {
            MGB_ASSERT_FLOAT_EQ(o11[n * C4 + ch - C3], c[i])
                    << ssprintf("failed at %zu", i);
        }
    }
}
// With force_output_dynamic_alloc enabled, compiles the same two outputs twice:
// once with host-copy callbacks to obtain reference values, and once with null
// callbacks, after which the output vars' device tensors are read directly and
// compared against the reference.
TEST(TestGraph, DynamicOutput) {
    using namespace opr;
    REQUIRE_GPU(1);
    auto gpu = CompNode::load("gpu0");
    constexpr size_t LEN0 = 20, LEN1 = 20;
    constexpr size_t TOTAL = LEN0 + LEN1;

    HostTensorGenerator<> gen;
    auto host_src = gen({TOTAL}, gpu);
    auto graph = ComputingGraph::make();
    graph->options().force_output_dynamic_alloc = true;

    SymbolVar src = opr::Host2DeviceCopy::make(*graph, host_src);
    auto partition = Split::Options::make_partition(src, 0, {LEN0, LEN1});
    auto parts = opr::Split::make(src, partition);
    auto doubled = opr::add(parts[1], parts[1]);

    // Reference run: values are copied to the host by callbacks.
    HostTensorND expect_doubled, expect_part0;
    auto ref_func = graph->compile(
            {make_callback_copy(doubled, expect_doubled),
             make_callback_copy(parts[0], expect_part0)});
    ref_func->execute().wait();

    // Run under test: no callbacks, results are taken from the output vars.
    auto func = graph->compile({{doubled, nullptr}, {parts[0], nullptr}});
    auto&& out_vars = func->get_output_vars();
    func->execute().wait();

    HostTensorND got_doubled;
    got_doubled.copy_from(out_vars[0]->dev_tensor()).sync();
    MGB_ASSERT_TENSOR_NEAR(expect_doubled, got_doubled, 1e-4);

    HostTensorND got_part0;
    got_part0.copy_from(out_vars[1]->dev_tensor()).sync();
    MGB_ASSERT_TENSOR_NEAR(expect_part0, got_part0, 1e-4);
}
namespace {
// Test-only operator whose single output is backed by a caller-supplied
// DeviceTensorND (stored in m_dv). The output var is created with the
// NO_SYS_MEM_ALLOC and ALLOW_EMPTY_SHAPE flags, so it can be used to feed
// tensors with an empty shape into a graph.
// NOTE: the class header (and opening brace) is produced by
// MGB_DEFINE_OPR_CLASS; m_dv is the first member declared after it.
MGB_DEFINE_OPR_CLASS(MaybeEmptyTensorOpr, cg::SingleCNOperatorNodeBase) DeviceTensorND m_dv;
// Uses the comp node of the wrapped tensor for both the output var and the
// operator itself.
void init_output_comp_node() override {
output(0)->comp_node(m_dv.comp_node());
comp_node(m_dv.comp_node());
}
// Execution only rebinds the output var to the wrapped tensor.
void scn_do_execute() override { output(0)->reset_dev_tensor_from_tensor(m_dv); }
// The output shape is registered as a constant equal to the wrapped tensor's
// shape.
void init_output_static_infer_desc() override {
using namespace cg::static_infer;
auto&& mgr = owner_graph()->static_infer_manager();
mgr.register_shape_infer(output(0), ShapeInferDesc::make_const(m_dv.shape()));
}
public:
// Constructs the operator in `graph`, keeping a copy of the `dv` handle and
// declaring one output whose dtype is taken from `dv`.
MaybeEmptyTensorOpr(
ComputingGraph& graph, const DeviceTensorND& dv,
const OperatorNodeConfig& config)
: Super(&graph, config, "", {}), m_dv{dv} {
add_output(None)
->add_flag(cg::VarNode::Flag::NO_SYS_MEM_ALLOC)
.add_flag(VarNode::Flag::ALLOW_EMPTY_SHAPE)
.dtype(dv.dtype());
}
// Inserts a new MaybeEmptyTensorOpr into `graph` and returns its output var.
static SymbolVar make(
ComputingGraph& graph, const DeviceTensorND& dv,
const OperatorNodeConfig& config = {}) {
return graph
.insert_opr(std::make_unique<MaybeEmptyTensorOpr>(graph, dv, config))
->output(0);
}
};
}  // anonymous namespace
// Companion of the declaration emitted by MGB_DEFINE_OPR_CLASS above.
MGB_DYN_TYPE_OBJ_FINAL_IMPL(MaybeEmptyTensorOpr);
// Feeds tensors (including ones with an empty shape) into a graph through
// MaybeEmptyTensorOpr and checks two consumers: an element-wise add, which is
// expected to run without throwing for every tested shape, and a MAX reduce,
// which is expected to throw when the input shape is empty.
TEST(TestMemReuse, ResetEmptyDevTensor) {
    // x1 + x2: the receiver info of x1 must report that empty input is allowed,
    // execution must not throw, and an empty input must yield an empty result.
    auto check_add = [](const TensorShape& shape) {
        HostTensorGenerator<> gen;
        auto graph = ComputingGraph::make();
        auto host_lhs = gen(shape);
        auto host_rhs = gen(shape);
        DeviceTensorND dev_lhs, dev_rhs;
        dev_lhs.copy_from(*host_lhs);
        dev_rhs.copy_from(*host_rhs);

        auto lhs = MaybeEmptyTensorOpr::make(*graph, dev_lhs, {"x1"});
        auto rhs = MaybeEmptyTensorOpr::make(*graph, dev_rhs, {"x2"});
        auto sum = lhs + rhs;

        HostTensorND host_sum;
        auto func = graph->compile({make_callback_copy(sum, host_sum)});
        auto* lhs_node = lhs.node();
        auto&& receiver_info =
                lhs_node->owner_graph()->var_receiver_in_current_comp_seq(lhs_node);
        ASSERT_TRUE(receiver_info.is_empty_allowed());
        ASSERT_NO_THROW(func->execute().wait());
        if (shape.is_empty()) {
            ASSERT_TRUE(host_sum.empty());
            ASSERT_TRUE(host_sum.shape().is_empty());
        }
    };

    // MAX reduce: execution must throw exactly when the input shape is empty.
    auto check_reduce = [](const TensorShape& shape) {
        HostTensorGenerator<> gen;
        auto graph = ComputingGraph::make();
        auto host_in = gen(shape);
        DeviceTensorND dev_in;
        dev_in.copy_from(*host_in);

        auto in = MaybeEmptyTensorOpr::make(*graph, dev_in, {"x"});
        auto reduced = opr::Reduce::make(in, {opr::Reduce::Mode::MAX, 0});

        HostTensorND host_reduced;
        auto func = graph->compile({make_callback_copy(reduced, host_reduced)});
        if (!shape.is_empty()) {
            ASSERT_NO_THROW(func->execute().wait());
        } else {
            ASSERT_ANY_THROW(func->execute().wait());
        }
    };

    check_add({2, 3, 4, 5});
    check_add({2, 0, 3, 4});
    check_reduce({4, 5, 6, 7});
    check_reduce({8, 0, 0, 9});
}