#include <limits>
#include <thread>
#include "example.h"
#if LITE_BUILD_WITH_MGE
#include "misc.h"
using namespace lite;
using namespace example;
#if LITE_WITH_CUDA
namespace {
bool device_input(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
lite::Config config{LiteDeviceType::LITE_CUDA};
NetworkIO network_io;
std::string input_name = "data";
bool is_host = false;
IO device_input{input_name, is_host};
network_io.inputs.push_back(device_input);
std::shared_ptr<Network> network = std::make_shared<Network>(config, network_io);
network->load_model(network_path);
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
Layout input_layout = input_tensor->get_layout();
auto src_tensor = parse_npy(input_path);
auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);
tensor_device.copy_from(*src_tensor);
input_tensor->reset(tensor_device.get_memory_ptr(), input_layout);
network->forward();
network->wait();
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
bool device_input_output(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
lite::Config config{LiteDeviceType::LITE_CUDA};
NetworkIO network_io;
std::string input_name = "data";
std::string output_name = "TRUE_DIV(EXP[12065],reduce0[12067])[12077]";
bool is_host = false;
IO device_input{input_name, is_host};
IO device_output{output_name, is_host};
network_io.inputs.push_back(device_input);
network_io.outputs.push_back(device_output);
std::shared_ptr<Network> network = std::make_shared<Network>(config, network_io);
network->load_model(network_path);
std::shared_ptr<Tensor> input_tensor_device = network->get_input_tensor(0);
Layout input_layout = input_tensor_device->get_layout();
auto src_tensor = parse_npy(input_path);
auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);
tensor_device.copy_from(*src_tensor);
input_tensor_device->reset(tensor_device.get_memory_ptr(), input_layout);
network->forward();
network->wait();
std::shared_ptr<Tensor> output_tensor_device = network->get_io_tensor(output_name);
auto output_tensor = std::make_shared<Tensor>();
output_tensor->copy_from(*output_tensor_device);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
//! Run one inference on CUDA with the network input supplied from a tensor
//! constructed with is_pinned_host == true.
//!
//! Steps: load the model at args.model_path (no NetworkIO is passed, only
//! the CUDA config), read the .npy file at args.input_path, copy it into a
//! tensor created with the pinned-host flag, bind that tensor's memory to
//! network input 0, run forward()/wait(), then print the max and sum of
//! output 0.
//!
//! \param args example arguments; only model_path and input_path are read.
//! \return true once the statistics have been printed; the function has no
//!         failure return path of its own.
bool pinned_host_input(const Args& args) {
    //! Bind by reference: the paths are only read, so no copy is needed.
    const std::string& network_path = args.model_path;
    const std::string& input_path = args.input_path;

    //! Configure the network for the CUDA device and load the model.
    lite::Config config{LiteDeviceType::LITE_CUDA};
    std::shared_ptr<Network> network = std::make_shared<Network>(config);
    network->load_model(network_path);

    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    Layout input_layout = input_tensor->get_layout();

    //! Read the input data from the .npy file.
    auto src_tensor = parse_npy(input_path);

    //! Allocate a tensor with the pinned-host flag set and fill it from the
    //! file.
    const bool is_pinned_host = true;
    auto tensor_pinned_input =
            Tensor(LiteDeviceType::LITE_CUDA, input_layout, is_pinned_host);
    tensor_pinned_input.copy_from(*src_tensor);

    //! Point the network input at the pinned tensor's memory.
    //! tensor_pinned_input is a local that lives until this function
    //! returns, i.e. past wait().
    input_tensor->reset(tensor_pinned_input.get_memory_ptr(), input_layout);

    network->forward();
    network->wait();

    //! Read output 0 directly through its memory pointer.
    //! NOTE(review): the elements are read as float -- confirm that the
    //! model output really has a float dtype.
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    const float* out_data =
            static_cast<const float*>(output_tensor->get_memory_ptr());
    const size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                              output_tensor->get_layout().get_elem_size();

    //! Seed the running max with -inf rather than -1 so that outputs whose
    //! values are all below -1 still report their true maximum.
    float max = -std::numeric_limits<float>::infinity();
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; ++i) {
        const float data = out_data[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}
}
//! Register each example function under a string name via REGIST_EXAMPLE.
//! NOTE(review): the macro is defined outside this file (presumably in
//! example.h) and presumably makes the examples selectable by name from the
//! example runner -- confirm against its definition.
REGIST_EXAMPLE("device_input", device_input);
REGIST_EXAMPLE("device_input_output", device_input_output);
REGIST_EXAMPLE("pinned_host_input", pinned_host_input);
#endif
#endif