#include "megbrain/graph.h"
#include "megbrain/graph/event.h"
#include "megbrain/opr/basic_arith_wrapper.h"
#include "megbrain/opr/blas.h"
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/opr/utility.h"
#include "megbrain/serialization/sereg.h"
#include "megbrain/test/helper.h"
using namespace mgb;
#if MGB_ENABLE_SUBLINEAR
namespace mgb {
namespace cg {

// Test-local re-declarations of graph-internal classes. Only the accessors
// needed by the BadOpr test are declared; no definitions are provided here, so
// they have to be resolved from the library the test links against.
// NOTE(review): these presumably mirror the real internal class definitions --
// confirm they stay in sync with the implementation headers.

class SeqModifierForSublinearMemory {
public:
    // Per-comp-node value read by the BadOpr test right after compile().
    const CompNode::UnorderedMap<size_t>& prev_min_bottleneck();
};

class ComputingGraphImpl : public ComputingGraph {
public:
    // Accessor used by the BadOpr test after downcasting a ComputingGraph.
    SeqModifierForSublinearMemory& seq_modifier_for_sublinear_memory();
};

}  // namespace cg
}  // namespace mgb
namespace {
/*!
 * \brief test-only operator whose output size is a multiple of its input size
 *
 * The statically inferred output shape has a single axis holding
 * (number of input elements * scale) elements. When constructed with
 * bad == true, the node property additionally carries
 * NodeProp::Flag::NO_AUTOMATIC_DUP.
 *
 * The operator can not be executed: scn_do_execute() asserts unconditionally,
 * so it is only suitable for graphs that are compiled but never run.
 */
MGB_DEFINE_OPR_CLASS(SublinearBadOpr, cg::SingleCNOperatorNodeBase) // {
    //! whether NO_AUTOMATIC_DUP is added to the node property
    bool m_flag;
    //! multiplier applied to the input element count to get the output size
    size_t m_scale;

    //! execution is intentionally unsupported
    void scn_do_execute() override { mgb_assert(0); }

    //! default node property, plus NO_AUTOMATIC_DUP when the flag is set
    NodeProp* do_make_node_prop() const override {
        auto* node_prop = Super::do_make_node_prop();
        if (!m_flag) {
            return node_prop;
        }
        node_prop->add_flag(NodeProp::Flag::NO_AUTOMATIC_DUP);
        return node_prop;
    }

    //! output shape depends only on the shape of input(0)
    void init_output_static_infer_desc() override {
        using namespace cg::static_infer;
        auto&& infer_mgr = owner_graph()->static_infer_manager();
        auto scaled_shape = [this](TensorShape& dst, const InpVal& inp) {
            const size_t nr_inp_elems = inp.val.at(0).shape().total_nr_elems();
            dst = TensorShape{nr_inp_elems * m_scale};
            return true;
        };
        infer_mgr.register_shape_infer(
                output(0),
                {SourceType::DEP, {{input(0), DepType::SHAPE}}, scaled_shape});
    }

public:
    /*!
     * \param inp the single input var
     * \param bad stored as the flag that controls NO_AUTOMATIC_DUP
     * \param scale output element count per input element
     * \param config operator config forwarded to the base class
     */
    SublinearBadOpr(
            VarNode* inp, bool bad, size_t scale, OperatorNodeConfig config = {})
            : Super{inp->owner_graph(), config, "subliner_bad_op", {inp}},
              m_flag{bad},
              m_scale{scale} {
        add_input({inp});
        add_output(None);
    }

    //! insert a new SublinearBadOpr into the graph of \p inp and return its
    //! output var
    static SymbolVar make(
            SymbolVar inp, bool bad, size_t scale, OperatorNodeConfig config = {}) {
        auto* inp_node = inp.node();
        auto* inserted = inp_node->owner_graph()->insert_opr(
                std::make_unique<SublinearBadOpr>(inp_node, bad, scale, config));
        return inserted->output(0);
    }

    //! value of the \p bad constructor argument
    bool flag() const { return m_flag; }

    //! value of the \p scale constructor argument
    size_t scale() const { return m_scale; }
};
MGB_DYN_TYPE_OBJ_FINAL_IMPL(SublinearBadOpr);
/*!
 * \brief shallow-copy implementation registered for SublinearBadOpr
 *
 * Creates a new SublinearBadOpr on the single given input, carrying over the
 * flag and scale of the source operator.
 *
 * \param ctx copy context (not used by this implementation)
 * \param opr_ operator to copy; must be a SublinearBadOpr
 * \param inputs replacement inputs; exactly one is required
 * \param config config for the newly created operator
 * \return the operator that owns the newly created output var
 */
cg::OperatorNodeBase* bad_opr_shallow_copy(
        const serialization::OprShallowCopyContext& ctx,
        const cg::OperatorNodeBase& opr_, const VarNodeArray& inputs,
        const OperatorNodeConfig& config) {
    mgb_assert(inputs.size() == 1);
    const auto& src = opr_.cast_final_safe<SublinearBadOpr>();
    SymbolVar copied =
            SublinearBadOpr::make(inputs[0], src.flag(), src.scale(), config);
    return copied.node()->owner_opr();
}
MGB_REG_OPR_SHALLOW_COPY(SublinearBadOpr, bad_opr_shallow_copy);
};
#if MGB_CUDA
// Precondition guard for the GPU tests below: requires one GPU and returns
// from the enclosing function (after logging a warning) when the second
// component of gpu0's memory status does not exceed 5 GiB. The log message
// treats that component as the available memory.
#define CHECK_REQ                                                         \
    do {                                                                  \
        REQUIRE_GPU(1);                                                   \
        const auto check_req_mem_status_ =                                \
                CompNode::load("gpu0").get_mem_status_bytes();            \
        if (check_req_mem_status_.second <= 5ull * 1024 * 1024 * 1024) {  \
            mgb_log_warn(                                                 \
                    "test skipped due to "                                \
                    "insufficient available gpu memory");                 \
            return;                                                       \
        }                                                                 \
    } while (0)
// Chain of ten conv(3x3, 5 output channels) + relu layers on gpu0. Kernel
// gradients of a dot-product loss are computed without any memory
// optimization (reference), then with sublinear memory, then with DTR, and
// each optimized result must match the reference within 1e-3.
TEST(TestSublinearMemory, FullConv) {
    CHECK_REQ;

    HostTensorGenerator<> rng;
    auto make_host = [&](const TensorShape& shape) { return rng(shape, "gpu0"); };

    constexpr size_t N = 128, H = 256, W = 256;
    auto host_data = make_host({N, 1, H, W});
    auto graph = ComputingGraph::make();
    SymbolVarArray params;
    auto data = opr::Host2DeviceCopy::make(*graph, host_data).rename("data");
    auto out = data;
    size_t in_chl = host_data->shape(1);
    size_t nr_layer = 0;

    // append one conv+relu layer with a freshly generated kernel
    auto add_layer = [&](size_t oc, size_t kh, size_t kw) {
        rng.std(sqrt(2.0 / (in_chl * kh * kw)));
        auto host_kern = make_host({oc, in_chl, kh, kw});
        auto dev_kern = std::make_shared<DeviceTensorND>();
        dev_kern->copy_from(*host_kern);
        params.emplace_back(opr::SharedDeviceTensor::make(*graph, dev_kern));
        auto kern = params.back().rename(ssprintf("param%zu", nr_layer));
        out = opr::relu(opr::Convolution::make(out, kern, {}));
        out.rename(ssprintf("out%zu", nr_layer));
        ++nr_layer;
        in_chl = oc;
    };
    for (int layer = 0; layer < 10; ++layer) {
        add_layer(5, 3, 3);
    }

    auto loss = opr::Dot::make(out.flatten(), out.flatten());

    // one gradient output per kernel, copied back to host on execution
    std::vector<HostTensorND> grads(params.size());
    ComputingGraph::OutputSpec out_spec;
    for (size_t idx = 0; idx < params.size(); ++idx) {
        out_spec.emplace_back(
                make_callback_copy(cg::grad(loss, params[idx]), grads[idx]));
    }

    // first pass (no sublinear) produces the reference, second pass the
    // sublinear result that stays in grads
    std::vector<HostTensorND> grads_ref(grads.size());
    for (bool use_sublinear : {false, true}) {
        graph->options().enable_sublinear_memory_opt = use_sublinear;
        auto func = graph->compile(out_spec);
        func->execute();
        if (use_sublinear) {
            continue;
        }
        for (size_t idx = 0; idx < grads.size(); ++idx) {
            grads_ref[idx].copy_from(grads[idx]);
        }
    }
    for (size_t idx = 0; idx < grads.size(); ++idx) {
        MGB_ASSERT_TENSOR_NEAR(grads[idx], grads_ref[idx], 1e-3);
    }

    // repeat with DTR instead of sublinear memory
    auto&& options = graph->options();
    options.enable_sublinear_memory_opt = false;
    options.enable_dtr_memory_opt = true;
    options.dtr_config.eviction_threshold = 1ULL << 30;
    auto func = graph->compile(out_spec);
    func->execute();
    for (size_t idx = 0; idx < grads.size(); ++idx) {
        MGB_ASSERT_TENSOR_NEAR(grads[idx], grads_ref[idx], 1e-3);
    }
}
// Ten layers that each split the activation into two halves along axis 1, run
// conv(3x3) + relu on every half and concatenate the results. Kernel gradients
// of a dot-product loss obtained with sublinear memory and with DTR must match
// the unoptimized reference within 1e-3.
TEST(TestSublinearMemory, ConcatSplit) {
    CHECK_REQ;

    HostTensorGenerator<> rng;
    auto make_host = [&](const TensorShape& shape) { return rng(shape, "gpu0"); };

    constexpr size_t N = 128, H = 256, W = 256;
    auto host_data = make_host({N, 2, H, W});
    auto graph = ComputingGraph::make();
    SymbolVarArray params;
    auto data = opr::Host2DeviceCopy::make(*graph, host_data).rename("data");
    auto out = data;
    size_t in_chl = host_data->shape(1);
    size_t nr_layer = 0;

    // append one split -> 2x(conv+relu) -> concat layer
    auto add_layer = [&](size_t oc, size_t kh, size_t kw) {
        auto parts =
                opr::Split::make(out, opr::Split::Options::make_average(1, 2));
        SymbolVarArray branch_out(2);
        size_t branch_in_chl[] = {in_chl / 2, in_chl - in_chl / 2};
        size_t branch_out_chl[] = {oc / 2, oc - oc / 2};
        for (int br = 0; br < 2; ++br) {
            rng.std(sqrt(2.0 / (branch_in_chl[br] * kh * kw)));
            auto host_kern =
                    make_host({branch_out_chl[br], branch_in_chl[br], kh, kw});
            auto dev_kern = std::make_shared<DeviceTensorND>();
            dev_kern->copy_from(*host_kern);
            params.emplace_back(opr::SharedDeviceTensor::make(*graph, dev_kern));
            auto kern =
                    params.back().rename(ssprintf("param%zu:%d", nr_layer, br));
            auto activated =
                    opr::relu(opr::Convolution::make(parts[br], kern, {}));
            branch_out[br] =
                    activated.rename(ssprintf("out%zu:%d", nr_layer, br));
        }
        ++nr_layer;
        in_chl = oc;
        out = opr::Concat::make(branch_out, 1);
    };
    for (int layer = 0; layer < 10; ++layer) {
        add_layer(6, 3, 3);
    }

    auto loss = opr::Dot::make(out.flatten(), out.flatten());

    // one gradient output per kernel, copied back to host on execution
    std::vector<HostTensorND> grads(params.size());
    ComputingGraph::OutputSpec out_spec;
    for (size_t idx = 0; idx < params.size(); ++idx) {
        out_spec.emplace_back(
                make_callback_copy(cg::grad(loss, params[idx]), grads[idx]));
    }

    // first pass (no sublinear) produces the reference, second pass the
    // sublinear result that stays in grads
    std::vector<HostTensorND> grads_ref(grads.size());
    for (bool use_sublinear : {false, true}) {
        graph->options().enable_sublinear_memory_opt = use_sublinear;
        auto func = graph->compile(out_spec);
        func->execute();
        if (use_sublinear) {
            continue;
        }
        for (size_t idx = 0; idx < grads.size(); ++idx) {
            grads_ref[idx].copy_from(grads[idx]);
        }
    }
    for (size_t idx = 0; idx < grads.size(); ++idx) {
        MGB_ASSERT_TENSOR_NEAR(grads[idx], grads_ref[idx], 1e-3);
    }

    // repeat with DTR instead of sublinear memory
    auto&& options = graph->options();
    options.enable_sublinear_memory_opt = false;
    options.enable_dtr_memory_opt = true;
    options.dtr_config.eviction_threshold = 1ULL << 30;
    auto func = graph->compile(out_spec);
    func->execute();
    for (size_t idx = 0; idx < grads.size(); ++idx) {
        MGB_ASSERT_TENSOR_NEAR(grads[idx], grads_ref[idx], 1e-3);
    }
}
// A single layer that splits the input into three parts along axis 1,
// convolves each part and concatenates the results; the third branch
// additionally goes through ten chained relu operators. Kernel gradients of a
// dot-product loss obtained with sublinear memory and with DTR must match the
// unoptimized reference within 1e-3.
TEST(TestSublinearMemory, MultiOutputOpr) {
    CHECK_REQ;

    HostTensorGenerator<> rng;
    auto make_host = [&](const TensorShape& shape) { return rng(shape, "gpu0"); };

    constexpr size_t N = 128, H = 256, W = 256;
    auto host_data = make_host({N, 3, H, W});
    auto graph = ComputingGraph::make();
    SymbolVarArray params;
    auto data = opr::Host2DeviceCopy::make(*graph, host_data).rename("data");
    auto out = data;
    size_t in_chl = host_data->shape(1);
    size_t nr_layer = 0;

    // append one split -> 3x conv (+ relu chain on the last branch) -> concat
    auto add_layer = [&](size_t oc, size_t kh, size_t kw) {
        auto parts =
                opr::Split::make(out, opr::Split::Options::make_average(1, 3));
        SymbolVarArray branch_out(3);
        size_t branch_in_chl[] = {in_chl / 3, in_chl / 3, in_chl - in_chl / 3 * 2};
        size_t branch_out_chl[] = {oc / 3, oc / 3, oc - oc / 3 * 2};
        for (int br = 0; br < 3; ++br) {
            rng.std(sqrt(2.0 / (branch_in_chl[br] * kh * kw)));
            auto host_kern =
                    make_host({branch_out_chl[br], branch_in_chl[br], kh, kw});
            auto dev_kern = std::make_shared<DeviceTensorND>();
            dev_kern->copy_from(*host_kern);
            params.emplace_back(opr::SharedDeviceTensor::make(*graph, dev_kern));
            auto kern =
                    params.back().rename(ssprintf("param%zu:%d", nr_layer, br));
            auto feat = opr::Convolution::make(parts[br], kern, {});
            if (br == 2) {
                for (size_t rep = 0; rep < 10; ++rep) {
                    feat = opr::relu(feat);
                }
            }
            branch_out[br] = feat;
        }
        ++nr_layer;
        in_chl = oc;
        out = opr::Concat::make(branch_out, 1);
    };
    add_layer(6, 3, 3);

    auto loss = opr::Dot::make(out.flatten(), out.flatten());

    // one gradient output per kernel, copied back to host on execution
    std::vector<HostTensorND> grads(params.size());
    ComputingGraph::OutputSpec out_spec;
    for (size_t idx = 0; idx < params.size(); ++idx) {
        out_spec.emplace_back(
                make_callback_copy(cg::grad(loss, params[idx]), grads[idx]));
    }

    // first pass (no sublinear) produces the reference, second pass the
    // sublinear result that stays in grads
    std::vector<HostTensorND> grads_ref(grads.size());
    for (bool use_sublinear : {false, true}) {
        graph->options().enable_sublinear_memory_opt = use_sublinear;
        auto func = graph->compile(out_spec);
        func->execute();
        if (use_sublinear) {
            continue;
        }
        for (size_t idx = 0; idx < grads.size(); ++idx) {
            grads_ref[idx].copy_from(grads[idx]);
        }
    }
    for (size_t idx = 0; idx < grads.size(); ++idx) {
        MGB_ASSERT_TENSOR_NEAR(grads[idx], grads_ref[idx], 1e-3);
    }

    // repeat with DTR instead of sublinear memory
    auto&& options = graph->options();
    options.enable_sublinear_memory_opt = false;
    options.enable_dtr_memory_opt = true;
    options.dtr_config.eviction_threshold = 1ULL << 30;
    auto func = graph->compile(out_spec);
    func->execute();
    for (size_t idx = 0; idx < grads.size(); ++idx) {
        MGB_ASSERT_TENSOR_NEAR(grads[idx], grads_ref[idx], 1e-3);
    }
}
// Chain of 100 padded conv(3x3) + relu layers whose output channel counts
// cycle through a fixed table of ten entries. Gradients are requested from the
// last kernel to the first. Sublinear and DTR results must match the
// unoptimized reference within 1e-4; the compiled function of each pass in the
// reference/sublinear loop is also dumped to a json file.
TEST(TestSublinearMemory, LongChain) {
    CHECK_REQ;

    HostTensorGenerator<> rng;
    auto make_host = [&](const TensorShape& shape) { return rng(shape, "gpu0"); };

    constexpr size_t N = 32, C = 3, H = 224, W = 224;
    auto host_data = make_host({N, C, H, W});
    auto graph = ComputingGraph::make();
    SymbolVarArray params;
    auto data = opr::Host2DeviceCopy::make(*graph, host_data).rename("data");
    auto out = data;
    size_t in_chl = host_data->shape(1);
    size_t nr_layer = 0;

    opr::Convolution::Param conv_param;
    conv_param.pad_h = 1;
    conv_param.pad_w = 1;

    // append one padded conv+relu layer with a freshly generated kernel
    auto add_layer = [&](size_t oc, size_t kh, size_t kw) {
        rng.std(sqrt(2.0 / (in_chl * kh * kw)));
        auto host_kern = make_host({oc, in_chl, kh, kw});
        auto dev_kern = std::make_shared<DeviceTensorND>();
        dev_kern->copy_from(*host_kern);
        params.emplace_back(opr::SharedDeviceTensor::make(*graph, dev_kern));
        auto kern = params.back().rename(ssprintf("param%zu", nr_layer));
        out = opr::relu(opr::Convolution::make(out, kern, conv_param));
        out.rename(ssprintf("out%zu", nr_layer));
        ++nr_layer;
        in_chl = oc;
    };

    int OC[] = {1, 1, 1, 12, 1, 1, 1, 1, 15, 1};
    for (int round = 1; round <= 10; ++round) {
        for (int pos = 0; pos < 10; ++pos) {
            add_layer(OC[pos], 3, 3);
        }
    }

    auto loss = opr::Dot::make(out.flatten(), out.flatten());

    // gradient outputs are listed in reverse parameter order
    std::vector<HostTensorND> grads(params.size());
    ComputingGraph::OutputSpec out_spec;
    for (size_t idx = params.size(); idx-- > 0;) {
        out_spec.emplace_back(
                make_callback_copy(cg::grad(loss, params[idx]), grads[idx]));
    }

    // first pass (no sublinear) produces the reference, second pass the
    // sublinear result that stays in grads
    std::vector<HostTensorND> grads_ref(grads.size());
    for (bool use_sublinear : {false, true}) {
        graph->options().enable_sublinear_memory_opt = use_sublinear;
        auto func = graph->compile(out_spec);
        func->execute();
        func->to_json()->writeto_fpath(output_file(
                ssprintf("TestSublinearMemory.LongChain%d.json", use_sublinear)));
        if (use_sublinear) {
            continue;
        }
        for (size_t idx = 0; idx < grads.size(); ++idx) {
            grads_ref[idx].copy_from(grads[idx]);
        }
    }
    for (size_t idx = 0; idx < grads.size(); ++idx) {
        MGB_ASSERT_TENSOR_NEAR(grads[idx], grads_ref[idx], 1e-4);
    }

    // repeat with DTR instead of sublinear memory
    auto&& options = graph->options();
    options.enable_sublinear_memory_opt = false;
    options.enable_dtr_memory_opt = true;
    options.dtr_config.eviction_threshold = 1ULL << 30;
    auto func = graph->compile(out_spec);
    func->execute();
    for (size_t idx = 0; idx < grads.size(); ++idx) {
        MGB_ASSERT_TENSOR_NEAR(grads[idx], grads_ref[idx], 1e-4);
    }
}
#endif
// The var x0 is read by three checking callbacks spread across the
// priority-ordered sequence, with larger tensors (y0, y1) used in between.
// With sublinear memory enabled, the allocation size reported by the
// StaticMemAlloc event must be positive and below 2.5 * NS bytes, while all
// callbacks still observe the expected values.
TEST(TestSublinearMemory, MultiReuse) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    constexpr size_t N = 1024, NS = N * sizeof(dt_float32);
    auto host_x = gen({N});
    auto host_y0 = gen({N * 2});
    auto host_y1 = gen({N * 2});
    auto host_z = gen({N});

    // wrap a var so that its device value is compared with `expected`
    auto call_check = [&](SymbolVar val, const HostTensorND& expected) {
        auto verify = [expected](const DeviceTensorND& dev_val) {
            HostTensorND get;
            get.copy_from(dev_val).sync();
            MGB_ASSERT_TENSOR_EQ(expected, get);
        };
        return opr::CallbackInjector::make(val, {true, verify});
    };

    auto x0 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x);
    auto z0 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_z);
    auto z1 = call_check(z0, *host_z);
    auto x1 = call_check(x0, *host_x);
    auto x2 = call_check(x0, *host_x);
    auto y0 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_y0);
    auto y01 = call_check(y0, *host_y0);
    auto y1 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_y1);
    auto y11 = call_check(y1, *host_y1);
    auto x3 = call_check(x0, *host_x);

    // priority equals the position in this list; every var is an output
    SymbolVar vars[] = {x0, z0, z1, x1, x2, y0, y01, y1, y11, x3};
    constexpr size_t nr_vars = sizeof(vars) / sizeof(vars[0]);
    ComputingGraph::OutputSpec out_spec;
    for (size_t idx = 0; idx < nr_vars; ++idx) {
        set_priority(vars[idx], idx);
        out_spec.push_back({vars[idx], {}});
    }

    // record the allocation size reported for a valid comp node
    size_t static_alloc_bytes = 0;
    auto alloc_evt_hdl =
            graph->event().register_receiver<cg::event::StaticMemAlloc>(
                    [&](const cg::event::StaticMemAlloc& evt) {
                        if (evt.comp_node.valid()) {
                            static_alloc_bytes = evt.alloc_size;
                        }
                    });

    graph->options().enable_sublinear_memory_opt = true;
    auto func = graph->compile(out_spec);
    func->execute();
    ASSERT_GT(static_alloc_bytes, 0u);
    ASSERT_LT(static_alloc_bytes, NS * 2 + (NS / 2));
}
// Graph that mixes ordinary elementwise chains with a var explicitly marked
// dynamic (xdyn) whose shape drives a reshape. With sublinear memory enabled
// and graph optimization disabled, the allocation size reported by the
// StaticMemAlloc event must be positive and below 2.5 * NS bytes, and the
// computed outputs must be numerically correct.
TEST(TestSublinearMemory, DynamicShape) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    constexpr size_t N = 1024, NS = N * sizeof(dt_float32);
    auto host_x = gen({N});
    auto host_p = gen({N});
    auto host_t = gen({N / 2 + 1, 2});

    auto x = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x).rename("x");
    auto y0 = (x + 1.f).rename("y0");
    auto y1 = (y0 + .4f).rename("y1");
    auto p = opr::Host2DeviceCopy::make_no_fwd(*graph, host_p).rename("p");
    auto po0 = (p + .5f).rename("po0");
    auto po1 = (p + .4f).rename("po1");
    auto po = (po0 + po1).rename("po");
    auto xt = (x + .5f).rename("xt");
    auto xdyn = opr::MarkDynamicVar::make(xt);
    // target shape for t1: shape of xdyn plus 2, i.e. {N + 2}
    auto t1_shp = (opr::GetVarShape::make(xdyn) + 2).rename("t0");
    auto t0 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_t);
    auto t1 = t0.reshape(t1_shp);

    set_priority(y0, 1);
    set_priority(y1, 1);
    set_priority(p, 2);
    set_priority(po, 2);
    set_priority(xt, 3);
    set_priority(xdyn, 4);
    set_priority(t0, 5);

    HostTensorND host_y1, host_t1;

    // record the allocation size reported for a valid comp node
    size_t static_alloc_bytes = 0;
    auto alloc_evt_hdl =
            graph->event().register_receiver<cg::event::StaticMemAlloc>(
                    [&](const cg::event::StaticMemAlloc& evt) {
                        if (evt.comp_node.valid()) {
                            static_alloc_bytes = evt.alloc_size;
                        }
                    });

    auto&& options = graph->options();
    options.graph_opt_level = 0;
    options.enable_sublinear_memory_opt = true;
    auto func = graph->compile(
            {make_callback_copy(y1, host_y1),
             {po, {}},
             make_callback_copy(t1, host_t1)});
    func->execute().to_json()->writeto_fpath(
            output_file("TestSublinearMemory.DynamicShape.json"));

    ASSERT_GT(static_alloc_bytes, 0u);
    ASSERT_LT(static_alloc_bytes, NS * 2 + NS / 2);

    // y1 == (x + 1) + 0.4
    auto x_ptr = host_x->ptr<float>();
    auto y1_ptr = host_y1.ptr<float>();
    for (size_t idx = 0; idx < N; ++idx) {
        MGB_ASSERT_FLOAT_EQ(x_ptr[idx] + 1.4f, y1_ptr[idx]);
    }

    // t1 is host_t viewed with shape {N + 2}
    host_t->resize({N + 2});
    MGB_ASSERT_TENSOR_EQ(*host_t, host_t1);
}
// Smoke test: compiling and executing a graph whose only output is a
// SharedDeviceTensor must work with sublinear memory enabled.
TEST(TestSublinearMemory, EmptyGraph) {
    HostTensorGenerator<> gen_host;
    auto graph = ComputingGraph::make();
    graph->options().enable_sublinear_memory_opt = true;

    auto shared = opr::SharedDeviceTensor::make(*graph, *gen_host({1}));

    ComputingGraph::OutputSpec out_spec{{shared, {}}};
    auto exec = graph->compile(out_spec);
    exec->execute();
}
// Builds y0 = x3 + x4, a chain of additions on top of it and finally
// y4 = AddUpdate(x4, y3). After compiling, both without and with sublinear
// memory, the owner operator of y4 must list y0 exactly once in its dep map.
// The graph is only compiled, never executed.
TEST(TestSublinearMemory, DepsInTopoSort) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    constexpr size_t N = 1024;
    auto host_x0 = gen({N});
    auto host_x1 = gen({N});
    auto host_x2 = gen({N});
    auto host_x3 = gen({N});

    auto x0 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x0);
    auto x1 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x1);
    auto x2 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x2);
    auto x3 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x3);
    auto x4 = opr::SharedDeviceTensor::make(*graph, *host_x0);
    auto y0 = x3 + x4;
    auto y1 = y0 + x2;
    auto y2 = y1 + x1;
    auto y3 = y2 + x0;
    auto y4 = opr::AddUpdate::make(x4, y3);

    // priority equals the position in this list; every var is an output
    SymbolVar vars[] = {x0, x1, x2, x3, x4, y0, y1, y2, y3, y4};
    constexpr size_t nr_vars = sizeof(vars) / sizeof(vars[0]);
    ComputingGraph::OutputSpec out_spec;
    for (size_t idx = 0; idx < nr_vars; ++idx) {
        set_priority(vars[idx], idx);
        out_spec.push_back({vars[idx], {}});
    }

    graph->options().graph_opt_level = 0;
    for (bool enable_sublinear : {false, true}) {
        graph->options().enable_sublinear_memory_opt = enable_sublinear;
        auto func = graph->compile(out_spec);
        auto&& y4_deps = y4.node()->owner_opr()->node_prop().dep_map();
        ASSERT_EQ(1u, y4_deps.count(y0.node()));
    }
}
// Compiles (without executing) a graph built around SublinearBadOpr, once with
// the first operator's flag cleared and once with it set. For both cases the
// test checks the value reported by prev_min_bottleneck() for the comp node
// and the number of SublinearBadOpr instances in the compiled sequence
// (3 without the flag, 2 with it).
TEST(TestSublinearMemory, BadOpr) {
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("xpu0");
    constexpr size_t N = 1024, Scale = 2;
    auto host_x = gen({N}, cn);

    for (bool bad : {false, true}) {
        auto graph = ComputingGraph::make();
        auto x = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x);
        auto bad_var = SublinearBadOpr::make(x, bad, Scale);
        auto y0 = opr::reduce_sum(bad_var, x.make_scalar_dt(1));
        auto y1 = SublinearBadOpr::make(y0, false, N * Scale);
        auto y = y1 + 1;
        auto z = opr::reduce_max(bad_var, x.make_scalar_dt(1));

        set_priority(y0, 0);
        set_priority(y1, 1);
        set_priority(y, 2);
        set_priority(z, 3);

        auto&& options = graph->options();
        options.graph_opt_level = 0;
        options.enable_sublinear_memory_opt = true;
        options.sublinear_mem_config.genetic_nr_iter = 50;
        auto func = graph->compile({{y, {}}, {z, {}}});

        // reach into the graph implementation for the recorded bottleneck
        auto* graph_impl = static_cast<cg::ComputingGraphImpl*>(graph.get());
        auto&& bottleneck = graph_impl->seq_modifier_for_sublinear_memory()
                                    .prev_min_bottleneck();
        size_t expect_elems = bad ? N * Scale * 2 + 1 : N * Scale + N;
        ASSERT_EQ(bottleneck.at(cn), expect_elems * host_x->dtype().size());

        // count SublinearBadOpr instances in the compiled operator sequence
        size_t nr_bad_opr = 0;
        auto count_bad_opr = [&nr_bad_opr](cg::OperatorNodeBase* opr) {
            if (opr->dyn_typeinfo() == SublinearBadOpr::typeinfo()) {
                ++nr_bad_opr;
            }
            return true;
        };
        func->iter_opr_seq(count_bad_opr);
        ASSERT_EQ(nr_bad_opr, bad ? 2 : 3);
    }
}
#else
#pragma message "tests are disabled as Sublinear is not enabled."
#endif