#include "backend_model_instance.h"
#include "status.h"
#ifndef _WIN32
#include <sys/resource.h>
#include <sys/syscall.h>
#include <unistd.h>
#endif
#include "backend_config.h"
#include "backend_model.h"
#include "cuda_utils.h"
#include "infer_stats.h"
#include "metrics.h"
#include "model_config.pb.h"
#include "numa_utils.h"
#include "server.h"
#include "shared_library.h"
#include "triton/common/logging.h"
#include "triton/common/nvtx.h"
#include "tritonserver_apis.h"
#if defined(_MSC_VER)
#define TRITONAPI_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONAPI_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONAPI_DECLSPEC
#endif
namespace triton { namespace core {
namespace {
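// Allocator callback used for warmup responses. Warmup output buffers are
// always placed in non-pinned CPU memory via malloc(), regardless of the
// preferred memory type requested by the backend.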
TRITONSERVER_Error*
WarmupResponseAlloc(
TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
size_t byte_size, TRITONSERVER_MemoryType preferred_memory_type,
int64_t preferred_memory_type_id, void* userp, void** buffer,
void** buffer_userp, TRITONSERVER_MemoryType* actual_memory_type,
int64_t* actual_memory_type_id)
{
*buffer = malloc(byte_size);
if (*buffer != nullptr) {
*actual_memory_type = TRITONSERVER_MEMORY_CPU;
*actual_memory_type_id = 0;
return nullptr;
}
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
"failed to allocate output buffer for warmup.");
}
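// Release callback paired with WarmupResponseAlloc; frees the malloc'ed
// warmup output buffer.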
TRITONSERVER_Error*
WarmupResponseRelease(
TRITONSERVER_ResponseAllocator* allocator, void* buffer, void* buffer_userp,
size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id)
{
free(buffer);
return nullptr;
}
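// Allocator object shared by all warmup requests; no allocation start
// callback is registered.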
ResponseAllocator warmup_allocator = ResponseAllocator(
WarmupResponseAlloc, WarmupResponseRelease, nullptr /* start_fn */);
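// Response-complete callback for warmup requests. 'userp' carries a pair of
// the completion promise and a shared error list; any response error message
// is appended to the list, and the promise is signaled on the final response
// flag.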
void
WarmupResponseComplete(
TRITONSERVER_InferenceResponse* iresponse, const uint32_t flags,
void* userp)
{
auto res_pair = reinterpret_cast<
std::pair<std::promise<void>, std::vector<std::string>*>*>(userp);
if (iresponse != nullptr) {
auto err = TRITONSERVER_InferenceResponseError(iresponse);
if (err != nullptr) {
static std::mutex res_mtx;
{
std::lock_guard<std::mutex> lk(res_mtx);
res_pair->second->emplace_back(TRITONSERVER_ErrorMessage(err));
}
TRITONSERVER_ErrorDelete(err);
}
LOG_TRITONSERVER_ERROR(
TRITONSERVER_InferenceResponseDelete(iresponse),
"deleting warmup response");
}
if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) != 0) {
res_pair->first.set_value();
}
}
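// Request-release callback for warmup requests. Signals the per-request
// promise once the backend has released all references to the request.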
void
WarmupRequestComplete(
TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp)
{
if ((flags & TRITONSERVER_REQUEST_RELEASE_ALL) != 0) {
if (userp != nullptr) {
auto warmup_promise = reinterpret_cast<std::promise<void>*>(userp);
warmup_promise->set_value();
}
}
}
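// Reject instance creation when memory usage on the target GPU already
// exceeds the fraction of total device memory allowed by the backend
// configuration.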
Status
VerifyModelLoadGpuFraction(
const std::string& name, TRITONSERVER_InstanceGroupKind kind,
int32_t device_id,
const triton::common::BackendCmdlineConfigMap& backend_config_map)
{
size_t free, total;
double memory_limit;
RETURN_IF_ERROR(GetDeviceMemoryInfo(device_id, &free, &total));
RETURN_IF_ERROR(BackendConfigurationModelLoadGpuFraction(
backend_config_map, device_id, &memory_limit));
const size_t allow = total * memory_limit;
const size_t used = total - free;
if (used > allow) {
return Status(
Status::Code::UNAVAILABLE,
std::string("can not create model '") + name +
"': memory limit set for " +
TRITONSERVER_InstanceGroupKindString(kind) + " " +
std::to_string(device_id) +
" has exceeded, model loading is rejected.");
}
return Status::Success;
}
}  // namespace
TritonModelInstance::TritonModelInstance(
TritonModel* model, const std::string& name, const Signature& signature,
const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
const std::vector<std::string>& profile_names, const bool passive,
const triton::common::HostPolicyCmdlineConfig& host_policy,
const TritonServerMessage& host_policy_message,
const std::vector<SecondaryDevice>& secondary_devices)
: model_(model), name_(name), signature_(signature), kind_(kind),
device_id_(device_id), host_policy_(host_policy),
host_policy_message_(host_policy_message), profile_names_(profile_names),
passive_(passive), secondary_devices_(secondary_devices), state_(nullptr)
{
#ifdef TRITON_ENABLE_METRICS
if (Metrics::Enabled()) {
const int id = (kind_ == TRITONSERVER_INSTANCEGROUPKIND_GPU)
? device_id_
: METRIC_REPORTER_ID_CPU;
const bool response_cache_enabled =
model_->ResponseCacheEnabled() &&
model_->Server()->ResponseCacheEnabled();
MetricModelReporter::Create(
model_->ModelId(), model_->Version(), id, response_cache_enabled,
model_->Config().metric_tags(), &reporter_);
}
#endif  // TRITON_ENABLE_METRICS
}
TritonModelInstance::~TritonModelInstance()
{
if (triton_backend_thread_.get() != nullptr) {
triton_backend_thread_->StopBackendThread();
}
model_->Server()->GetRateLimiter()->UnregisterModelInstance(this);
if (model_->Backend()->ModelInstanceFiniFn() != nullptr) {
LOG_TRITONSERVER_ERROR(
model_->Backend()->ModelInstanceFiniFn()(
reinterpret_cast<TRITONBACKEND_ModelInstance*>(this)),
"failed finalizing model instance");
}
}
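// Create and initialize a model instance. The instance's host policy (e.g.
// NUMA bindings) is applied to the creating thread for the duration of
// construction and reset afterward.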
Status
TritonModelInstance::CreateInstance(
TritonModel* model, const std::string& name, const Signature& signature,
TRITONSERVER_InstanceGroupKind kind, int32_t device_id,
const std::vector<std::string>& profile_names, const bool passive,
const std::string& host_policy_name,
const inference::ModelRateLimiter& rate_limiter_config,
const std::vector<SecondaryDevice>& secondary_devices,
std::shared_ptr<TritonModelInstance>* triton_model_instance)
{
static triton::common::HostPolicyCmdlineConfig empty_host_policy;
const triton::common::HostPolicyCmdlineConfig* host_policy =
&empty_host_policy;
const auto policy_it = model->HostPolicyMap().find(host_policy_name);
if (policy_it != model->HostPolicyMap().end()) {
host_policy = &policy_it->second;
}
RETURN_IF_ERROR(SetNumaConfigOnThread(*host_policy));
auto err = ConstructAndInitializeInstance(
model, name, signature, kind, device_id, profile_names, passive,
host_policy_name, *host_policy, rate_limiter_config, secondary_devices,
triton_model_instance);
RETURN_IF_ERROR(ResetNumaMemoryPolicy());
RETURN_IF_ERROR(err);
if (kind == TRITONSERVER_INSTANCEGROUPKIND_GPU) {
RETURN_IF_ERROR(VerifyModelLoadGpuFraction(
name, kind, device_id, model->BackendConfigMap()));
}
return Status::Success;
}
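// Construct the instance object, run the backend's instance-initialization
// hook, and, for non-passive instances, prepare warmup data, register with
// the rate limiter, and attach a backend thread.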
Status
TritonModelInstance::ConstructAndInitializeInstance(
TritonModel* model, const std::string& name, const Signature& signature,
const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
const std::vector<std::string>& profile_names, const bool passive,
const std::string& host_policy_name,
const triton::common::HostPolicyCmdlineConfig& host_policy,
const inference::ModelRateLimiter& rate_limiter_config,
const std::vector<SecondaryDevice>& secondary_devices,
std::shared_ptr<TritonModelInstance>* triton_model_instance)
{
triton::common::TritonJson::Value host_policy_json(
triton::common::TritonJson::ValueType::OBJECT);
triton::common::TritonJson::Value policy_setting_json(
host_policy_json, triton::common::TritonJson::ValueType::OBJECT);
for (const auto& pr : host_policy) {
RETURN_IF_ERROR(policy_setting_json.AddString(pr.first.c_str(), pr.second));
}
RETURN_IF_ERROR(host_policy_json.Add(
host_policy_name.c_str(), std::move(policy_setting_json)));
TritonServerMessage host_policy_message(host_policy_json);
std::unique_ptr<TritonModelInstance> local_instance(new TritonModelInstance(
model, name, signature, kind, device_id, profile_names, passive,
host_policy, host_policy_message, secondary_devices));
TRITONBACKEND_ModelInstance* triton_instance =
reinterpret_cast<TRITONBACKEND_ModelInstance*>(local_instance.get());
if (model->Backend()->ModelInstanceInitFn() != nullptr) {
#ifdef _WIN32
std::unique_ptr<SharedLibrary> slib;
RETURN_IF_ERROR(SharedLibrary::Acquire(&slib));
RETURN_IF_ERROR(slib->SetLibraryDirectory(model->Backend()->Directory()));
#endif
TRITONSERVER_Error* err =
model->Backend()->ModelInstanceInitFn()(triton_instance);
#ifdef _WIN32
RETURN_IF_ERROR(slib->ResetLibraryDirectory());
#endif
RETURN_IF_TRITONSERVER_ERROR(err);
}
if (!passive) {
RETURN_IF_ERROR(local_instance->GenerateWarmupData());
RETURN_IF_ERROR(model->Server()->GetRateLimiter()->RegisterModelInstance(
local_instance.get(), rate_limiter_config));
RETURN_IF_ERROR(local_instance->SetBackendThread(
kind, device_id, model->DeviceBlocking()));
}
triton_model_instance->reset(local_instance.release());
return Status::Success;
}
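// Attach the instance to a backend thread, reusing an already-started thread
// for the same device when the backend's device-blocking mode allows sharing.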
Status
TritonModelInstance::SetBackendThread(
const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
const bool device_blocking)
{
if (ShareBackendThread(device_blocking, kind)) {
auto device_instances = model_->GetInstancesByDevice(device_id);
if (!device_instances.empty()) {
LOG_VERBOSE(1) << "Using already started backend thread for " << Name()
<< " on device " << device_id;
triton_backend_thread_ = device_instances[0]->triton_backend_thread_;
}
}
if (triton_backend_thread_.get() == nullptr) {
std::unique_ptr<TritonBackendThread> local_backend_thread;
RETURN_IF_ERROR(TritonBackendThread::CreateBackendThread(
Name(), this, 0 /* nice */, device_id, &local_backend_thread));
triton_backend_thread_ = std::move(local_backend_thread);
} else {
triton_backend_thread_->AddModelInstance(this);
}
RETURN_IF_ERROR(triton_backend_thread_->InitAndWarmUpModelInstance(this));
return Status::Success;
}
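// Build the warmup requests described by the model configuration. Input
// buffers are allocated once per warmup sample and shared across all of the
// sample's requests.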
Status
TritonModelInstance::GenerateWarmupData()
{
warmup_samples_.clear();
for (const auto& warmup_setting : model_->Config().model_warmup()) {
if (warmup_setting.batch_size() == 0) {
LOG_VERBOSE(1) << "Skipping batch 0 warmup sample '"
<< warmup_setting.name() << "'";
continue;
}
LOG_VERBOSE(1) << "Generating warmup sample data for '"
<< warmup_setting.name() << "'";
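// First pass: find the largest zero-initialized and random-initialized
// buffer sizes needed across all inputs so a single allocation of each kind
// can be shared.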
int64_t max_zero_byte_size = 0;
int64_t max_random_byte_size = 0;
for (const auto& input_meta : warmup_setting.inputs()) {
auto element_count =
triton::common::GetElementCount(input_meta.second.dims());
if (element_count == -1) {
return Status(
Status::Code::INVALID_ARG,
"warmup setting expects all variable-size dimensions are specified "
"for input '" +
input_meta.first + "'");
}
int64_t batch_byte_size =
element_count *
triton::common::GetDataTypeByteSize(input_meta.second.data_type());
if (batch_byte_size == 0) {
batch_byte_size = element_count * sizeof(int32_t);
}
switch (input_meta.second.input_data_type_case()) {
case inference::ModelWarmup_Input::InputDataTypeCase::kZeroData:
max_zero_byte_size = std::max(batch_byte_size, max_zero_byte_size);
break;
case inference::ModelWarmup_Input::InputDataTypeCase::kRandomData: {
if (input_meta.second.data_type() ==
inference::DataType::TYPE_STRING) {
max_zero_byte_size = std::max(batch_byte_size, max_zero_byte_size);
} else {
max_random_byte_size =
std::max(batch_byte_size, max_random_byte_size);
}
break;
}
default:
break;
}
}
warmup_samples_.emplace_back(warmup_setting.name(), warmup_setting.count());
auto& warmup_data = warmup_samples_.back();
TRITONSERVER_MemoryType type;
int64_t type_id;
warmup_data.zero_data_.reset(new AllocatedMemory(
max_zero_byte_size, TRITONSERVER_MEMORY_CPU_PINNED /* memory_type */,
0 /* memory_type_id */));
char* zero_buffer = warmup_data.zero_data_->MutableBuffer(&type, &type_id);
memset(zero_buffer, 0, max_zero_byte_size);
warmup_data.random_data_.reset(new AllocatedMemory(
max_random_byte_size, TRITONSERVER_MEMORY_CPU_PINNED /* memory_type */,
0 /* memory_type_id */));
char* random_buffer =
warmup_data.random_data_->MutableBuffer(&type, &type_id);
for (int64_t offset = 0; offset < max_random_byte_size; offset++) {
random_buffer[offset] = rand();
}
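// Second pass: create one request per batch slot and point each input at
// the shared zero/random buffer or at data read from a warmup file.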
for (size_t cnt = 0; cnt < warmup_setting.batch_size(); cnt++) {
warmup_data.requests_.emplace_back(
new InferenceRequest(model_, model_->Version()));
auto& lrequest = warmup_data.requests_.back();
std::vector<std::shared_ptr<InferenceRequest::Input>> input_sps;
for (const auto& input_meta : warmup_setting.inputs()) {
auto batch1_element_count =
triton::common::GetElementCount(input_meta.second.dims());
auto batch_byte_size =
batch1_element_count *
triton::common::GetDataTypeByteSize(input_meta.second.data_type());
if (batch_byte_size == 0) {
batch_byte_size = batch1_element_count * sizeof(int32_t);
}
const char* allocated_ptr;
switch (input_meta.second.input_data_type_case()) {
case inference::ModelWarmup_Input::InputDataTypeCase::kZeroData:
allocated_ptr = zero_buffer;
break;
case inference::ModelWarmup_Input::InputDataTypeCase::kRandomData: {
if (input_meta.second.data_type() ==
inference::DataType::TYPE_STRING) {
allocated_ptr = zero_buffer;
} else {
allocated_ptr = random_buffer;
}
break;
}
case inference::ModelWarmup_Input::InputDataTypeCase::
kInputDataFile: {
warmup_data.provided_data_.emplace_back(new std::string());
auto input_data = warmup_data.provided_data_.back().get();
RETURN_IF_ERROR(ReadTextFile(
JoinPath(
{model_->LocalizedModelPath(), kWarmupDataFolder,
input_meta.second.input_data_file()}),
input_data));
if (input_meta.second.data_type() ==
inference::DataType::TYPE_STRING) {
batch_byte_size = input_data->size();
} else if (((size_t)batch_byte_size) > input_data->size()) {
return Status(
Status::Code::INVALID_ARG,
lrequest->LogRequest() + "warmup setting expects " +
std::to_string(batch_byte_size) +
" bytes, but the data "
"provided from " +
input_meta.second.input_data_file() + "only has " +
std::to_string(input_data->size()) + " bytes");
}
allocated_ptr = input_data->data();
break;
}
default:
return Status(
Status::Code::INVALID_ARG,
lrequest->LogRequest() + "warmup setting expects input '" +
input_meta.first + "' to have input_data_type set");
}
const inference::ModelInput* input_config;
bool is_original_input =
model_->GetInput(input_meta.first, &input_config).IsOk();
InferenceRequest::Input* input = nullptr;
std::vector<int64_t> input_meta_shape;
if ((model_->Config().max_batch_size() != 0) && is_original_input) {
input_meta_shape.push_back(1);
}
for (auto d : input_meta.second.dims()) {
input_meta_shape.push_back(d);
}
if (is_original_input) {
RETURN_IF_ERROR(lrequest->AddOriginalInput(
input_meta.first, input_meta.second.data_type(), input_meta_shape,
&input));
} else {
input_sps.emplace_back();
RETURN_IF_ERROR(lrequest->AddOverrideInput(
input_meta.first, input_meta.second.data_type(),
(model_->Config().max_batch_size() != 0 ? 1 : 0),
input_meta_shape, &input_sps.back()));
input = input_sps.back().get();
}
RETURN_IF_ERROR(input->AppendData(
allocated_ptr, batch_byte_size,
TRITONSERVER_MEMORY_CPU /* memory_type */, 0 /* memory_type_id */));
}
RETURN_IF_ERROR(lrequest->PrepareForInference());
for (const auto& sp : input_sps) {
RETURN_IF_ERROR(lrequest->AddOverrideInput(sp));
}
}
}
return Status::Success;
}
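// Move requests into the EXECUTING state, loading any associated input
// states first.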
Status
TritonModelInstance::PrepareRequestsForExecution(
std::vector<std::unique_ptr<InferenceRequest>>& requests)
{
for (auto& r : requests) {
RETURN_IF_ERROR(r->LoadInputStates());
RETURN_IF_ERROR(r->SetState(InferenceRequest::State::EXECUTING));
}
return Status::Success;
}
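// Same as PrepareRequestsForExecution(), but on failure sends an error
// response for every request before returning the failing status.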
Status
TritonModelInstance::PrepareRequestsOrRespond(
std::vector<std::unique_ptr<InferenceRequest>>& requests)
{
auto status = PrepareRequestsForExecution(requests);
if (!status.IsOk()) {
for (auto& r : requests) {
InferenceRequest::RespondIfError(
r, status, true /* release_requests */, FailureReason::OTHER);
}
LOG_STATUS_ERROR(status, "Requests failed pre-execution checks");
}
return status;
}
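// Hand a batch of requests to the backend for execution. Ownership of the
// requests transfers to the backend via the raw TRITONBACKEND_Request
// pointers.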
Status
TritonModelInstance::Schedule(
std::vector<std::unique_ptr<InferenceRequest>>&& requests)
{
RETURN_IF_ERROR(PrepareRequestsOrRespond(requests));
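// Reuse a thread-local vector so the capacity established by the initial
// size persists across calls; clear() does not release it.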
thread_local std::vector<TRITONBACKEND_Request*> triton_requests(1024);
triton_requests.clear();
for (auto& r : requests) {
triton_requests.push_back(
reinterpret_cast<TRITONBACKEND_Request*>(r.release()));
}
Execute(triton_requests);
return Status::Success;
}
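// Runs on the backend thread before any execution; applies the instance's
// host policy (NUMA configuration) to that thread.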
Status
TritonModelInstance::Initialize()
{
RETURN_IF_ERROR(SetNumaConfigOnThread(HostPolicy()));
return Status::Success;
}
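// Run all generated warmup samples to completion, collecting any response
// errors. The sample retains ownership of its requests; the backend only
// borrows them during Execute().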
Status
TritonModelInstance::WarmUp()
{
std::vector<triton::core::TritonModelInstance::WarmupData> lwarmup_samples;
lwarmup_samples.swap(warmup_samples_);
for (auto& sample : lwarmup_samples) {
for (size_t iteration = 1; iteration <= sample.count_; ++iteration) {
LOG_VERBOSE(1) << "model '" << sample.requests_.back()->ModelName()
<< "' instance " << Name() << " is running warmup sample '"
<< sample.sample_name_ << "' for iteration " << iteration;
std::vector<std::promise<void>> request_complete(sample.requests_.size());
std::vector<std::string> response_errors;
std::vector<std::pair<std::promise<void>, std::vector<std::string>*>>
response_complete(sample.requests_.size());
std::vector<TRITONBACKEND_Request*> triton_requests;
for (size_t i = 0; i < sample.requests_.size(); ++i) {
auto& request = sample.requests_[i];
request->SetReleaseCallback(
WarmupRequestComplete, &request_complete[i]);
response_complete[i].second = &response_errors;
request->SetResponseCallback(
&warmup_allocator, nullptr, WarmupResponseComplete,
&response_complete[i]);
request->SetResponseFactory();
#ifdef TRITON_ENABLE_STATS
request->CaptureRequestStartNs();
#endif  // TRITON_ENABLE_STATS
request->CaptureQueueStartNs();
triton_requests.push_back(
reinterpret_cast<TRITONBACKEND_Request*>(request.get()));
}
Execute(triton_requests);
for (size_t i = 0; i < sample.requests_.size(); ++i) {
request_complete[i].get_future().get();
response_complete[i].first.get_future().get();
}
if (response_errors.size() != 0) {
std::string err_str =
"failed to run warmup sample '" + sample.sample_name_ + "': ";
for (const auto& error : response_errors) {
err_str += (error + "; ");
}
LOG_VERBOSE(1) << "model '" << sample.requests_.back()->ModelName()
<< "' instance " << Name()
<< " failed to run warmup sample '"
<< sample.sample_name_ << "'";
return Status(Status::Code::INVALID_ARG, err_str);
}
}
}
return Status::Success;
}
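// Invoke the backend's execute entry point. If the call itself fails, the
// requests are reclaimed and responded to with the error; otherwise the
// backend has taken ownership of them.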
void
TritonModelInstance::Execute(
std::vector<TRITONBACKEND_Request*>& triton_requests)
{
TRITONBACKEND_ModelInstance* triton_model_instance =
reinterpret_cast<TRITONBACKEND_ModelInstance*>(this);
TritonBackend::TritonModelInstanceExecFn_t inst_exec_fn =
model_->Backend()->ModelInstanceExecFn();
TRITONSERVER_Error* err = inst_exec_fn(
triton_model_instance, &triton_requests[0], triton_requests.size());
if (err != nullptr) {
Status status = Status(
TritonCodeToStatusCode(TRITONSERVER_ErrorCode(err)),
TRITONSERVER_ErrorMessage(err));
for (TRITONBACKEND_Request* tr : triton_requests) {
std::unique_ptr<InferenceRequest> ur(
reinterpret_cast<InferenceRequest*>(tr));
InferenceRequest::RespondIfError(
ur, status, true /* release_requests */, FailureReason::BACKEND);
}
TRITONSERVER_ErrorDelete(err);
}
}
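// Spawn the thread that feeds payloads from the rate limiter to this
// instance's backend.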
Status
TritonModelInstance::TritonBackendThread::CreateBackendThread(
const std::string name, TritonModelInstance* model_instance, const int nice,
const int32_t device_id,
std::unique_ptr<TritonBackendThread>* triton_backend_thread)
{
TritonBackendThread* raw_triton_backend_thread =
new TritonBackendThread(name, model_instance->Model(), nice, device_id);
std::unique_ptr<TritonBackendThread> runner(raw_triton_backend_thread);
runner->AddModelInstance(model_instance);
runner->backend_thread_ = std::thread([raw_triton_backend_thread]() {
raw_triton_backend_thread->BackendThread();
});
triton_backend_thread->reset(runner.release());
return Status::Success;
}
void
TritonModelInstance::TritonBackendThread::AddModelInstance(
TritonModelInstance* model_instance)
{
model_instances_.push_back(model_instance);
}
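// Run the INIT and WARM_UP payloads for the instance through the rate
// limiter, waiting for each to complete.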
Status
TritonModelInstance::TritonBackendThread::InitAndWarmUpModelInstance(
TritonModelInstance* model_instance)
{
auto init_payload = model_->Server()->GetRateLimiter()->GetPayload(
Payload::Operation::INIT, model_instance);
RETURN_IF_ERROR(
model_->Server()->GetRateLimiter()->EnqueuePayload(model_, init_payload));
RETURN_IF_ERROR(init_payload->Wait());
auto warmup_payload = model_->Server()->GetRateLimiter()->GetPayload(
Payload::Operation::WARM_UP, model_instance);
RETURN_IF_ERROR(model_->Server()->GetRateLimiter()->EnqueuePayload(
model_, warmup_payload));
RETURN_IF_ERROR(warmup_payload->Wait());
return Status::Success;
}
TritonModelInstance::TritonBackendThread::TritonBackendThread(
const std::string& name, TritonModel* model, const int nice,
const int32_t device_id)
: name_(name), nice_(nice), device_id_(device_id), model_(model)
{
}
TritonModelInstance::TritonBackendThread::~TritonBackendThread()
{
StopBackendThread();
}
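// Enqueue an EXIT payload for the thread's last registered instance and join
// the thread if it is running.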
void
TritonModelInstance::TritonBackendThread::StopBackendThread()
{
if (backend_thread_.joinable()) {
auto exit_payload = model_->Server()->GetRateLimiter()->GetPayload(
Payload::Operation::EXIT, model_instances_.back());
model_->Server()->GetRateLimiter()->EnqueuePayload(model_, exit_payload);
backend_thread_.join();
}
}
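// Main loop of the backend thread: apply the requested nice value where
// supported, then repeatedly dequeue payloads from the rate limiter and
// execute them until an EXIT payload sets 'should_exit'.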
void
TritonModelInstance::TritonBackendThread::BackendThread()
{
#ifndef _WIN32
if (setpriority(PRIO_PROCESS, syscall(SYS_gettid), nice_) == 0) {
LOG_VERBOSE(1) << "Starting backend thread for " << name_ << " at nice "
<< nice_ << " on device " << device_id_ << "...";
} else {
LOG_VERBOSE(1) << "Starting backend thread for " << name_
<< " at default nice (requested nice " << nice_ << " failed)"
<< " on device " << device_id_ << "...";
}
#else
LOG_VERBOSE(1) << "Starting backend thread for " << name_
<< " at default nice on device " << device_id_ << "...";
#endif
bool should_exit = false;
while (!should_exit) {
std::shared_ptr<Payload> payload;
model_->Server()->GetRateLimiter()->DequeuePayload(
model_instances_, &payload);
NVTX_RANGE(nvtx_, "BackendThread " + name_);
payload->Execute(&should_exit);
model_instances_.push_back(payload->GetInstance());
model_->Server()->GetRateLimiter()->PayloadRelease(payload);
}
LOG_VERBOSE(1) << "Stopping backend thread for " << name_ << "...";
}
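// Per-response statistics populated through the
// TRITONBACKEND_ModelInstanceResponseStatistics setters below and reported
// by TRITONBACKEND_ModelInstanceReportResponseStatistics.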
struct ModelInstanceResponseStatistics {
#ifdef TRITON_ENABLE_STATS
ModelInstanceResponseStatistics()
: model_instance(nullptr), response_factory(nullptr), response_start(0),
compute_output_start(0), response_end(0), error(nullptr)
{
}
TritonModelInstance* model_instance;
std::shared_ptr<InferenceResponseFactory>* response_factory;
uint64_t response_start;
uint64_t compute_output_start;
uint64_t response_end;
TRITONSERVER_Error* error;
#endif  // TRITON_ENABLE_STATS
};
extern "C" {
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceName(
TRITONBACKEND_ModelInstance* instance, const char** name)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*name = ti->Name().c_str();
return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceKind(
TRITONBACKEND_ModelInstance* instance, TRITONSERVER_InstanceGroupKind* kind)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*kind = ti->Kind();
return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceDeviceId(
TRITONBACKEND_ModelInstance* instance, int32_t* device_id)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*device_id = ti->DeviceId();
return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceHostPolicy(
TRITONBACKEND_ModelInstance* instance, TRITONSERVER_Message** host_policy)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*host_policy = const_cast<TRITONSERVER_Message*>(
reinterpret_cast<const TRITONSERVER_Message*>(&ti->HostPolicyMessage()));
return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceProfileCount(
TRITONBACKEND_ModelInstance* instance, uint32_t* count)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*count = ti->Profiles().size();
return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceProfileName(
TRITONBACKEND_ModelInstance* instance, const uint32_t index,
const char** profile_name)
{
*profile_name = nullptr;
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
const auto& rprofiles = ti->Profiles();
if (index >= rprofiles.size()) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("out of bounds index ") + std::to_string(index) +
": instance is configured with " + std::to_string(rprofiles.size()) +
" profiles")
.c_str());
}
*profile_name = rprofiles[index].c_str();
return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceSecondaryDeviceCount(
TRITONBACKEND_ModelInstance* instance, uint32_t* count)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*count = ti->SecondaryDevices().size();
return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceSecondaryDeviceProperties(
TRITONBACKEND_ModelInstance* instance, uint32_t index, const char** kind,
int64_t* id)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
const auto& rsecondarydevices = ti->SecondaryDevices();
if (index >= rsecondarydevices.size()) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("out of bounds index ") + std::to_string(index) +
": instance is configured with " +
std::to_string(rsecondarydevices.size()) + " secondary devices")
.c_str());
}
*kind = rsecondarydevices[index].kind_.c_str();
*id = rsecondarydevices[index].id_;
return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceIsPassive(
TRITONBACKEND_ModelInstance* instance, bool* is_passive)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*is_passive = ti->IsPassive();
return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceModel(
TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Model** model)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*model = reinterpret_cast<TRITONBACKEND_Model*>(ti->Model());
return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceState(
TRITONBACKEND_ModelInstance* instance, void** state)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*state = ti->State();
return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceSetState(
TRITONBACKEND_ModelInstance* instance, void* state)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
ti->SetState(state);
return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportStatistics(
TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request* request,
const bool success, const uint64_t exec_start_ns,
const uint64_t compute_start_ns, const uint64_t compute_end_ns,
const uint64_t exec_end_ns)
{
#ifdef TRITON_ENABLE_STATS
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
tr->ReportStatistics(
ti->MetricReporter(), success, exec_start_ns, compute_start_ns,
compute_end_ns, exec_end_ns);
#endif
return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceResponseStatisticsNew(
TRITONBACKEND_ModelInstanceResponseStatistics** response_statistics)
{
#ifdef TRITON_ENABLE_STATS
*response_statistics =
reinterpret_cast<TRITONBACKEND_ModelInstanceResponseStatistics*>(
new ModelInstanceResponseStatistics());
#endif
return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceResponseStatisticsDelete(
TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics)
{
#ifdef TRITON_ENABLE_STATS
delete reinterpret_cast<ModelInstanceResponseStatistics*>(
response_statistics);
#endif
return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceResponseStatisticsSetModelInstance(
TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
TRITONBACKEND_ModelInstance* model_instance)
{
#ifdef TRITON_ENABLE_STATS
ModelInstanceResponseStatistics* rs =
reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);
rs->model_instance = reinterpret_cast<TritonModelInstance*>(model_instance);
#endif
return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseFactory(
TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
TRITONBACKEND_ResponseFactory* response_factory)
{
#ifdef TRITON_ENABLE_STATS
ModelInstanceResponseStatistics* rs =
reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);
rs->response_factory =
reinterpret_cast<std::shared_ptr<InferenceResponseFactory>*>(
response_factory);
#endif
return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseStart(
TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
uint64_t response_start)
{
#ifdef TRITON_ENABLE_STATS
ModelInstanceResponseStatistics* rs =
reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);
rs->response_start = response_start;
#endif
return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceResponseStatisticsSetComputeOutputStart(
TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
uint64_t compute_output_start)
{
#ifdef TRITON_ENABLE_STATS
ModelInstanceResponseStatistics* rs =
reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);
rs->compute_output_start = compute_output_start;
#endif
return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseEnd(
TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
uint64_t response_end)
{
#ifdef TRITON_ENABLE_STATS
ModelInstanceResponseStatistics* rs =
reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);
rs->response_end = response_end;
#endif
return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceResponseStatisticsSetError(
TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
TRITONSERVER_Error* error)
{
#ifdef TRITON_ENABLE_STATS
ModelInstanceResponseStatistics* rs =
reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);
rs->error = error;
#endif
return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportResponseStatistics(
TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics)
{
#ifdef TRITON_ENABLE_STATS
ModelInstanceResponseStatistics* rs =
reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);
InferenceStatsAggregator* sa =
rs->model_instance->Model()->MutableStatsAggregator();
std::string key =
std::to_string((*rs->response_factory)->GetAndIncrementResponseIndex());
if (rs->error == nullptr) {
if (rs->compute_output_start > 0) {
RETURN_TRITONSERVER_ERROR_IF_ERROR(sa->UpdateResponseSuccess(
key, rs->response_start, rs->compute_output_start, rs->response_end));
} else {
RETURN_TRITONSERVER_ERROR_IF_ERROR(
sa->UpdateResponseEmpty(key, rs->response_start, rs->response_end));
}
} else if (
TRITONSERVER_ErrorCode(rs->error) == TRITONSERVER_ERROR_CANCELLED) {
RETURN_TRITONSERVER_ERROR_IF_ERROR(
sa->UpdateResponseCancel(key, rs->response_start, rs->response_end));
} else {
RETURN_TRITONSERVER_ERROR_IF_ERROR(sa->UpdateResponseFail(
key, rs->response_start, rs->compute_output_start, rs->response_end));
}
#endif
return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportBatchStatistics(
TRITONBACKEND_ModelInstance* instance, const uint64_t batch_size,
const uint64_t exec_start_ns, const uint64_t compute_start_ns,
const uint64_t compute_end_ns, const uint64_t exec_end_ns)
{
#ifdef TRITON_ENABLE_STATS
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
ti->Model()->MutableStatsAggregator()->UpdateInferBatchStats(
ti->MetricReporter(), batch_size, exec_start_ns, compute_start_ns,
compute_end_ns, exec_end_ns);
#endif
return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportMemoryUsage(
TRITONBACKEND_ModelInstance* instance,
TRITONSERVER_BufferAttributes** usage, uint32_t usage_size)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
ti->SetMemoryUsage({reinterpret_cast<BufferAttributes**>(usage), usage_size});
return nullptr;  // success
}
}  // extern "C"

}}  // namespace triton::core