// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <atomic>
#include <chrono>
#include <deque>
#include <functional>
#include <map>
#include <memory>
#include <ostream>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>
#include "buffer_attributes.h"
#include "infer_response.h"
#include "infer_stats.h"
#include "infer_trace.h"
#include "memory.h"
#include "response_allocator.h"
#include "sequence_state.h"
#include "status.h"
#include "triton/common/logging.h"
#include "triton/common/model_config.h"
#include "tritonserver_apis.h"
namespace triton { namespace core {
class Model;
class InferenceServer;
class MetricModelReporter;
//
// An inference request. A request can be used multiple times for
// inference but before each inference run, PrepareForInference() must
// be called to verify and prepare the request. Verification involves
// ensuring that any changes made since the last inference are
// valid. Preparing involves removing/resetting any state left over
// from the previous inference.
//
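// A minimal lifecycle sketch (illustrative only, not a definitive flow;
// assumes 'model' is a valid std::shared_ptr<Model>, 'data'/'data_size'
// are caller-owned, "INPUT0"/"OUTPUT0" are hypothetical tensor names, and
// release/response callbacks and error handling are elided):
//
//   auto request =
//       std::make_unique<InferenceRequest>(model, -1 /* latest version */);
//   InferenceRequest::Input* input = nullptr;
//   request->AddOriginalInput(
//       "INPUT0", inference::DataType::TYPE_FP32, {1, 4}, &input);
//   input->AppendData(data, data_size, TRITONSERVER_MEMORY_CPU, 0);
//   request->AddOriginalRequestedOutput("OUTPUT0");
//   InferenceRequest::Run(request);  // takes ownership on success
//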
class InferenceRequest {
public:
using InternalReleaseFn =
std::function<Status(std::unique_ptr<InferenceRequest>&, const uint32_t)>;
/// State for the request object.
enum class State {
// The request has been initialized, but not yet enqueued.
INITIALIZED,
// The request is pending execution.
PENDING,
// The request failed to enqueue.
FAILED_ENQUEUE,
// The request has been picked up by a backend model instance for execution,
// but hasn't been released yet.
EXECUTING,
// The request has been released.
RELEASED
};
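// A typical progression, inferred from the state comments above (the
// actual transitions are defined and validated by SetState() below):
// INITIALIZED -> PENDING -> EXECUTING -> RELEASED, with FAILED_ENQUEUE
// assumed reachable when enqueueing fails.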
// Input tensor
class Input {
public:
enum class TensorType { TENSOR, SHAPE_TENSOR, NON_LINEAR };
Input();
Input(
const std::string& name, const inference::DataType datatype,
const std::vector<int64_t>& shape);
Input(
const std::string& name, const inference::DataType datatype,
const int64_t* shape, const uint64_t dim_count);
// Set the name, data type and original shape of the input tensor.
void SetMetadata(
const std::string& name, const inference::DataType& dt,
const std::vector<int64_t>& shape);
// The name of the input tensor. There is no mutable operator for
// the name because it is used in an InferenceRequest map and a
// mutable method would allow it to get out-of-sync.
const std::string& Name() const { return name_; }
// Data type of the input tensor.
inference::DataType DType() const { return datatype_; }
// The original shape of the input tensor.
const std::vector<int64_t>& OriginalShape() const
{
return original_shape_;
}
// The shape of the input tensor after normalization. This shape
// is the original shape modified as required/expected by
// inference processing.
const std::vector<int64_t>& Shape() const { return shape_; }
std::vector<int64_t>* MutableShape() { return &shape_; }
// FIXME. Should not need these functions. All shapes kept here
// should include the batch dimension instead of being split into
// batch + shape.
const std::vector<int64_t>& ShapeWithBatchDim() const
{
return shape_with_batch_dim_;
}
std::vector<int64_t>* MutableShapeWithBatchDim()
{
return &shape_with_batch_dim_;
}
// Return true if host-policy-specific data was added for this input
bool HasHostPolicySpecificData() const
{
return has_host_policy_specific_data_;
}
// Whether or not the input is a TensorRT shape tensor
bool IsShapeTensor() const
{
return tensor_type_ == TensorType::SHAPE_TENSOR;
}
// Specifies whether the input uses a non-linear IO format
bool IsNonLinearFormatIo() const
{
return tensor_type_ == TensorType::NON_LINEAR;
}
// Set the input to be treated as a shape tensor.
Status SetIsShapeTensor();
// Set the input to use a non-linear IO format
Status SetIsNonLinearFormatIo();
// The data for this input.
const std::shared_ptr<Memory>& Data() const { return data_; }
// The data for this input for a specific device
const std::shared_ptr<Memory>& Data(
const std::string& host_policy_name) const;
// Return all host policy data set for this input
const std::map<std::string, std::shared_ptr<Memory>>& HostPolicyData() const
{
return host_policy_data_map_;
}
// Set the data for this input. Error if input already has some
// data.
Status SetData(const std::shared_ptr<Memory>& data);
// Set the data associated with the host policy for this input.
// Return error if input already has some data.
Status SetData(
const std::string& host_policy_name,
const std::shared_ptr<Memory>& data);
// Append a new buffer of data to this input.
Status AppendData(
const void* base, size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id);
Status AppendDataWithHostPolicy(
const void* base, size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id, const char* host_policy_name);
Status AppendDataWithBufferAttributes(
const void* base, BufferAttributes* buffer_attributes);
// Prepend a new buffer of data to this input.
Status PrependData(
const void* base, size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id);
// Remove all existing data for the input.
Status RemoveAllData();
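// Example (sketch): input data may be assembled from multiple
// non-contiguous chunks; 'input' is an Input* and 'buf0'/'buf1' are
// hypothetical caller-owned buffers that must outlive the request.
//
//   RETURN_IF_ERROR(input->AppendData(
//       buf0, buf0_size, TRITONSERVER_MEMORY_CPU, 0 /* memory_type_id */));
//   RETURN_IF_ERROR(input->AppendData(
//       buf1, buf1_size, TRITONSERVER_MEMORY_GPU, 0 /* device 0 */));
//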
// Get the number of buffers containing the input tensor data.
size_t DataBufferCount() const { return data_->BufferCount(); }
// Get the number of buffers containing the input tensor data with
// host policy. If there are no buffers corresponding to the specific
// host policy, the number of buffers in the fallback input data is
// returned.
size_t DataBufferCountForHostPolicy(
const std::string& host_policy_name) const;
// Get the 'idx' buffer containing a contiguous chunk of bytes for
// the input. Return error if 'idx' refers to a buffer that does
// not exist. Return a pointer to the chunk in 'base' and the
// size of the chunk in 'byte_size'. 'memory_type' acts as
// both input and output. On input 'memory_type' is the buffer
// memory type preferred by the function caller. On return
// 'memory_type' gives the actual memory type of the chunk pointed
// to by 'base'. 'memory_type_id' acts as both input and
// output. On input 'memory_type_id' is the buffer memory type id
// preferred by the function caller. On return 'memory_type_id'
// gives the actual memory type id of the chunk pointed to by
// 'base'.
Status DataBuffer(
const size_t idx, const void** base, size_t* byte_size,
TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id) const;
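// Example (sketch): walking all buffers of a const Input& 'input',
// preferring CPU memory; on return the preferences are overwritten with
// the actual placement of each chunk.
//
//   for (size_t i = 0; i < input.DataBufferCount(); ++i) {
//     const void* base = nullptr;
//     size_t byte_size = 0;
//     TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU;
//     int64_t memory_type_id = 0;
//     RETURN_IF_ERROR(input.DataBuffer(
//         i, &base, &byte_size, &memory_type, &memory_type_id));
//     // 'memory_type'/'memory_type_id' now describe where 'base' lives.
//   }
//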
// Get the buffer attributes associated with 'idx' buffer.
Status DataBufferAttributes(
const size_t idx, const void** base,
BufferAttributes** buffer_attributes) const;
// Get the 'idx' buffer containing a contiguous chunk of bytes for
// the input. Return error if 'idx' refers to a buffer that does
// not exist. Return a pointer to the chunk in 'base' and the
// size of the chunk in 'byte_size'. 'memory_type' acts as
// both input and output. On input 'memory_type' is the buffer
// memory type preferred by the function caller. On return
// 'memory_type' gives the actual memory type of the chunk pointed
// to by 'base'. 'memory_type_id' acts as both input and
// output. On input 'memory_type_id' is the buffer memory type id
// preferred by the function caller. On return 'memory_type_id'
// gives the actual memory type id of the chunk pointed to by
// 'base'.
Status DataBufferForHostPolicy(
const size_t idx, const void** base, size_t* byte_size,
TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id,
const std::string& host_policy_name) const;
private:
DISALLOW_COPY_AND_ASSIGN(Input);
friend std::ostream& operator<<(
std::ostream& out, const InferenceRequest::Input& input);
std::string name_;
inference::DataType datatype_;
std::vector<int64_t> original_shape_;
std::vector<int64_t> shape_;
std::vector<int64_t> shape_with_batch_dim_;
TensorType tensor_type_;
std::shared_ptr<Memory> data_;
bool has_host_policy_specific_data_;
// A map of host policy to input data memory
std::map<std::string, std::shared_ptr<Memory>> host_policy_data_map_;
};
// Sequence ID can be either a 64-bit integer or a string.
// This class implements the SequenceId type.
class SequenceId {
public:
enum class DataType { UINT64, STRING };
SequenceId();
SequenceId(const std::string& sequence_label);
SequenceId(uint64_t sequence_index);
SequenceId& operator=(const SequenceId& rhs) = default;
SequenceId& operator=(const std::string& rhs);
SequenceId& operator=(const uint64_t rhs);
// Functions that help determine the exact type of the sequence ID
DataType Type() const { return id_type_; }
bool InSequence() const
{
return ((sequence_label_ != "") || (sequence_index_ != 0));
}
// Get the value of the SequenceId based on the type
const std::string& StringValue() const { return sequence_label_; }
uint64_t UnsignedIntValue() const { return sequence_index_; }
private:
friend std::ostream& operator<<(
std::ostream& out, const InferenceRequest::SequenceId& correlation_id);
friend bool operator==(const SequenceId lhs, const SequenceId rhs);
friend bool operator!=(const SequenceId lhs, const SequenceId rhs);
std::string sequence_label_;
uint64_t sequence_index_;
DataType id_type_;
};
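// Example (sketch): a SequenceId can be assigned either representation;
// per InSequence() above, an empty label together with index 0 means the
// request is not part of a sequence.
//
//   InferenceRequest::SequenceId id;    // not in a sequence
//   id = uint64_t{42};                  // DataType::UINT64
//   id = std::string("session-a");      // DataType::STRING
//   if (id.Type() == InferenceRequest::SequenceId::DataType::STRING) {
//     LOG_INFO << id.StringValue();
//   }
//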
// InferenceRequest
//
// The two constructors are identical except one takes model as a
// shared pointer and the other as a raw pointer. The shared pointer
// version is the primary one and acts to keep the model alive as
// long as the request is in flight. The raw pointer version is used
// only for cases where the model itself is issuing a request
// (e.g. warmup) and no shared pointer version of the model exists
// (because we aren't using shared_from_this).
InferenceRequest(
const std::shared_ptr<Model>& model,
const int64_t requested_model_version);
InferenceRequest(Model* model, const int64_t requested_model_version);
const std::string& ModelName() const;
int64_t RequestedModelVersion() const { return requested_model_version_; }
int64_t ActualModelVersion() const;
const std::string& Id() const { return id_; }
void SetId(const std::string& i) { id_ = i; }
// Return string for logging request ID
std::string LogRequest() const
{
std::string id = Id();
if (id.empty()) {
id = "<id_unknown>";
}
return std::string("[request id: ") + id + "] ";
}
// Flags for the request, union of TRITONSERVER_RequestFlag.
uint32_t Flags() const { return flags_; }
void SetFlags(uint32_t f) { flags_ = f; }
const SequenceId& CorrelationId() const { return correlation_id_; }
void SetCorrelationId(const SequenceId& c) { correlation_id_ = c; }
// The batch size of the request, as understood by Triton. A
// batch-size of 0 indicates that the model doesn't support batching
// in a way that Triton understands. Batch size is not set
// explicitly so there is no setter for it. It is set when the
// request is normalized.
uint32_t BatchSize() const { return batch_size_; }
uint64_t Priority() const { return priority_; }
void SetPriority(uint64_t p);
uint64_t TimeoutMicroseconds() const { return timeout_us_; }
void SetTimeoutMicroseconds(uint64_t t) { timeout_us_ = t; }
const std::string& CacheKey() const { return cache_key_; }
// It is up to the user to update the cache_key_ if modifying any hashable
// fields of the request after cache_key_is_set_ has been set to true.
void SetCacheKey(const std::string& key)
{
cache_key_ = key;
cache_key_is_set_ = true;
}
bool CacheKeyIsSet() const { return cache_key_is_set_; }
// Define and validate state transitions for request.
Status SetState(InferenceRequest::State state);
#ifdef TRITON_ENABLE_TRACING
const std::shared_ptr<InferenceTraceProxy>& TraceProxy() const
{
return trace_;
}
std::shared_ptr<InferenceTraceProxy>* MutableTrace() { return &trace_; }
void SetTrace(const std::shared_ptr<InferenceTraceProxy>& trace)
{
trace_ = trace;
response_factory_->SetTrace(trace);
}
void ReleaseTrace()
{
trace_ = nullptr;
response_factory_->ReleaseTrace();
}
Status TraceInputTensors(
TRITONSERVER_InferenceTraceActivity activity, const std::string& msg);
#endif // TRITON_ENABLE_TRACING
// Add a parameter to the request.
Status AddParameter(const char* name, const char* value);
Status AddParameter(const char* name, const int64_t value);
Status AddParameter(const char* name, const bool value);
Status AddParameter(const char* name, const double value);
Status SetParameters(const std::deque<InferenceParameter>& parameters);
const std::deque<InferenceParameter>& Parameters() const
{
return parameters_;
}
// The original inputs are the inputs added to the request before
// the inference execution (that is before
// TRITONSERVER_ServerInferAsync is called). Once execution has
// started the original inputs should not be modified until
// execution completes (and those modifications will apply to the
// next inference execution).
Status MutableOriginalInput(const std::string& name, Input** input);
std::unordered_map<std::string, Input>* MutableOriginalInputs()
{
return &original_inputs_;
}
const std::unordered_map<std::string, Input>& OriginalInputs() const
{
return original_inputs_;
}
// The override inputs are the inputs added to the request after
// inference execution has started (that is after
// TRITONSERVER_ServerInferAsync or equivalent is called). During
// inference processing, if Triton needs to change an original input
// it will add an override instead of changing the original. Triton
// will also use an override if it needs to add a new input to the
// request. Overrides are recorded as shared_ptr so that the same
// override can be used efficiently multiple times or even in
// multiple requests simultaneously. Be careful not to modify
// an override input if it is being shared unless you want that
// change to be reflected in all requests that hold that override
// input. Override inputs within a specific request are not
// persisted across inference calls.
std::unordered_map<std::string, std::shared_ptr<Input>>*
MutableOverrideInputs()
{
return &override_inputs_;
}
const std::unordered_map<std::string, std::shared_ptr<Input>>&
OverrideInputs() const
{
return override_inputs_;
}
// Get an input taking into account both original inputs and
// overrides. If an override input is available use it, otherwise
// use the original input. Accessing inputs via this method is not
// valid until after PrepareForInference is called.
Status ImmutableInput(const std::string& name, const Input** input) const;
const std::unordered_map<std::string, Input*>& ImmutableInputs() const
{
return inputs_;
}
// The original requested outputs are the requested outputs added to
// the request before the inference execution (that is before
// TRITONSERVER_ServerInferAsync is called). Once execution has
// started the original requested outputs should not be modified
// until execution completes (and those modifications will apply to
// the next inference execution).
const std::set<std::string>& OriginalRequestedOutputs() const
{
return original_requested_outputs_;
}
// Get the requested outputs that should be used during
// inference. Accessing outputs via this method is not valid until
// after PrepareForInference is called.
const std::set<std::string>& ImmutableRequestedOutputs() const
{
return (requested_outputs_.empty()) ? original_requested_outputs_
: requested_outputs_;
}
// Get the response factory.
const std::shared_ptr<InferenceResponseFactory>& ResponseFactory() const
{
return response_factory_;
}
// Add an original input to the request. If 'input' is non-null
// return a pointer to the newly added input.
Status AddOriginalInput(
const std::string& name, const inference::DataType datatype,
const int64_t* shape, const uint64_t dim_count, Input** input = nullptr);
Status AddOriginalInput(
const std::string& name, const inference::DataType datatype,
const std::vector<int64_t>& shape, Input** input = nullptr);
// Add an original raw input to the request. If 'input' is non-null
// return a pointer to the newly added input.
Status AddRawInput(const std::string& name, Input** input = nullptr);
// Remove a single original input or all inputs.
Status RemoveOriginalInput(const std::string& name);
Status RemoveAllOriginalInputs();
// Add an override input to the request. If 'input' is non-null
// return a pointer to the newly added input.
// FIXME passing batch size is special handling for the backend API.
// For an override input, the 'shape' is without the batch dimension for
// backends implemented without the backend API (which need a correct
// input.Shape()), but the backend API uses input.ShapeWithBatchDim().
Status AddOverrideInput(
const std::string& name, const inference::DataType datatype,
const int64_t batch_size, const std::vector<int64_t>& shape,
std::shared_ptr<Input>* input = nullptr);
// Add an override input to the request.
Status AddOverrideInput(const std::shared_ptr<Input>& input);
// Request an original requested output.
Status AddOriginalRequestedOutput(const std::string& name);
// Remove a single original requested output or all requested
// outputs.
Status RemoveOriginalRequestedOutput(const std::string& name);
Status RemoveAllOriginalRequestedOutputs();
// Initialize the release callback for the request.
Status SetReleaseCallback(
TRITONSERVER_InferenceRequestReleaseFn_t release_fn, void* release_userp)
{
release_fn_ = release_fn;
release_userp_ = release_userp;
return Status::Success;
}
// Initialize the response factory arguments that are going to be used with
// any responses produced for this request.
Status SetResponseCallback(
const ResponseAllocator* allocator, void* alloc_userp,
TRITONSERVER_InferenceResponseCompleteFn_t response_fn,
void* response_userp)
{
response_allocator_ = allocator;
alloc_userp_ = alloc_userp;
response_callback_ = response_fn;
response_userp_ = response_userp;
return Status::Success;
}
// Returns the preferred memory type and memory type ID of the output buffer
// for the request. 'name' and 'byte_size' are optional and may be set to
// nullptr if not specified; if provided, they give the allocator more
// information. 'memory_type' and 'memory_type_id' are also used as input
// to provide the types preferred by the caller.
// Status::Code::UNAVAILABLE will be returned if output properties are not
// available.
Status OutputBufferProperties(
const char* name, size_t* byte_size, TRITONSERVER_MemoryType* memory_type,
int64_t* memory_type_id);
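// Example (sketch): querying placement for a hypothetical output
// "OUTPUT0"; the preferences passed in are overwritten with the actual
// properties on success.
//
//   size_t byte_size = 0;
//   TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU;
//   int64_t memory_type_id = 0;
//   Status status = request->OutputBufferProperties(
//       "OUTPUT0", &byte_size, &memory_type, &memory_type_id);
//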
// Add a callback to be invoked on releasing the request object from Triton.
// Multiple callbacks can be added by calling this function in order,
// and they will be invoked in reverse order.
Status AddInternalReleaseCallback(InternalReleaseFn&& callback)
{
release_callbacks_.emplace_back(std::move(callback));
return Status::Success;
}
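// Example (sketch): with two callbacks added in this order, the second
// one runs first when the request is released.
//
//   request->AddInternalReleaseCallback(
//       [](std::unique_ptr<InferenceRequest>& r, const uint32_t flags) {
//         // ... runs second ...
//         return Status::Success;
//       });
//   request->AddInternalReleaseCallback(
//       [](std::unique_ptr<InferenceRequest>& r, const uint32_t flags) {
//         // ... runs first ...
//         return Status::Success;
//       });
//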
// Add a delegator to be invoked on sending the responses of this request.
// The response will be passed to 'delegator' and 'delegator' must call
// InferenceResponse::Send() to send the response.
Status SetResponseDelegator(
std::function<void(
std::unique_ptr<InferenceResponse>&&, const uint32_t)>&& delegator)
{
response_delegator_ = std::move(delegator);
return response_factory_->SetResponseDelegator(response_delegator_);
}
Status SetSequenceStates(
const std::shared_ptr<SequenceStates>& sequence_states)
{
sequence_states_ = sequence_states;
return Status::Success;
}
Status LoadInputStates();
void SetResponseFactory()
{
response_factory_.reset(new InferenceResponseFactory(
model_shared_, id_, response_allocator_, alloc_userp_,
response_callback_, response_userp_, response_delegator_));
}
const std::shared_ptr<SequenceStates>& GetSequenceStates() const
{
return sequence_states_;
}
// Prepare this request for inference.
Status PrepareForInference();
// Run this inference request using the model associated with the
// request. If Status::Success is returned then the call has taken
// ownership of the request object and so 'request' will be
// nullptr. If non-success is returned then the caller still retains
// ownership of 'request'.
static Status Run(std::unique_ptr<InferenceRequest>& request);
// Send an error response for this request. If 'status' is Success
// then no response is sent and the request is not released (even if
// 'release_request' is true). Because this is sending an error it
// is assumed that this is the last response for the request and so
// the FINAL flag is set in the response callback. If
// 'release_request' is true then the release callback is called for
// this request and ownership is given to the callback. Thus, if
// 'release_request' is true 'request' is returned as nullptr.
static void RespondIfError(
std::unique_ptr<InferenceRequest>& request, const Status& status,
const bool release_request = false,
FailureReason reason = FailureReason::OTHER);
// Send an error response to a set of 'requests'. If 'status' is
// Success then no responses are sent and the requests are not
// released (even if 'release_requests' is true). Because this is
// sending an error it is assumed that this is the last response for
// the requests and so the FINAL flag is set in the response
// callbacks. If 'release_requests' is true then the release callback
// is called for each request, and the request ownership is given to
// the callback. Thus, if 'release_requests' is true 'requests' is
// returned with all nullptrs.
static void RespondIfError(
std::vector<std::unique_ptr<InferenceRequest>>& requests,
const Status& status, const bool release_requests = false,
FailureReason reason = FailureReason::OTHER);
// Release the request. Call the release callback and transfer
// ownership of the request to the callback. On return 'request' is
// nullptr.
static Status Release(
std::unique_ptr<InferenceRequest>&& request,
const uint32_t release_flags);
// Create a copy of 'from' suitable for use as a "null" request as
// required for the direct sequence batcher. The returned copy will
// contain only the minimum content required for a null request.
// The statistics of the copy will not be collected.
static InferenceRequest* CopyAsNull(const InferenceRequest& from);
uint64_t QueueStartNs() const { return queue_start_ns_; }
uint64_t CaptureQueueStartNs()
{
queue_start_ns_ = std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::steady_clock::now().time_since_epoch())
.count();
return queue_start_ns_;
}
uint64_t CacheLookupStartNs() const { return cache_lookup_start_ns_; }
uint64_t CaptureCacheLookupStartNs()
{
cache_lookup_start_ns_ =
std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::steady_clock::now().time_since_epoch())
.count();
return cache_lookup_start_ns_;
}
uint64_t CacheLookupEndNs() const { return cache_lookup_end_ns_; }
uint64_t CaptureCacheLookupEndNs()
{
cache_lookup_end_ns_ =
std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::steady_clock::now().time_since_epoch())
.count();
return cache_lookup_end_ns_;
}
uint64_t BatcherStartNs() const { return batcher_start_ns_; }
uint64_t CaptureBatcherStartNs()
{
batcher_start_ns_ = std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::steady_clock::now().time_since_epoch())
.count();
return batcher_start_ns_;
}
#ifdef TRITON_ENABLE_STATS
uint64_t RequestStartNs() const { return request_start_ns_; }
uint64_t CaptureRequestStartNs()
{
request_start_ns_ = std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::steady_clock::now().time_since_epoch())
.count();
return request_start_ns_;
}
// Report the statistics to stats collectors associated with the request.
// Duration and timestamps provide two granularities for stats collectors.
void ReportStatistics(
MetricModelReporter* metric_reporter, bool success,
const uint64_t compute_start_ns, const uint64_t compute_input_end_ns,
const uint64_t compute_output_start_ns, const uint64_t compute_end_ns);
// Report the error statistics to stats collectors associated with the
// request.
// FIXME: A separate function may not be necessary here, but is being used
// cautiously in case of unforeseen issues such as possibly capturing a trace
// twice. This should be revisited and better tested to see if the
// ReportStatistics function can be used as-is for the newly captured failure
// cases.
void ReportErrorStatistics(
MetricModelReporter* metric_reporter, FailureReason reason);
// Report the statistics to stats collectors associated with the request.
// Duration and timestamps provide two granularities for stats collectors.
void ReportStatisticsWithDuration(
MetricModelReporter* metric_reporter, bool success,
const uint64_t compute_start_ns, const uint64_t compute_input_duration_ns,
const uint64_t compute_infer_duration_ns,
const uint64_t compute_output_duration_ns);
// Report the statistics to stats collectors associated with the request on
// response cache hits. Cache miss stats will be updated through model object
// directly because the backend may release the request object.
void ReportStatisticsCacheHit(MetricModelReporter* metric_reporter);
// Statistics for each request are aggregated into the corresponding
// model's statistics. Optionally this function may be used to
// add an additional aggregator where statistics are also aggregated.
void SetSecondaryStatsAggregator(
InferenceStatsAggregator* secondary_stats_aggregator)
{
secondary_stats_aggregator_ = secondary_stats_aggregator;
}
#endif // TRITON_ENABLE_STATS
// Mark the request as cancelled.
Status Cancel()
{
if (!response_factory_) {
return Status(
Status::Code::INTERNAL,
"It is not possible to cancel an inference request before calling "
"TRITONSERVER_InferAsync.");
}
response_factory_->Cancel();
return Status::Success;
}
// Check if the request is marked as cancelled. This does not indicate
// whether the request is actually cancelled. The request is cancelled if
// and only if a response with a cancelled status is sent for it.
Status IsCancelled(bool* is_cancelled)
{
if (!response_factory_) {
return Status(
Status::Code::INTERNAL,
"It is not possible to query cancellation status before calling "
"TRITONSERVER_InferAsync.");
}
*is_cancelled = response_factory_->IsCancelled();
return Status::Success;
}
bool IsCancelled()
{
bool is_cancelled = false;
Status status = IsCancelled(&is_cancelled);
if (!status.IsOk()) {
LOG_ERROR << status.Message();
}
return is_cancelled;
}
private:
DISALLOW_COPY_AND_ASSIGN(InferenceRequest);
friend std::ostream& operator<<(
std::ostream& out, const InferenceRequest& request);
friend std::ostream& operator<<(
std::ostream& out, const InferenceRequest::State& state);
Status Normalize();
// Helper for validating Inputs
Status ValidateRequestInputs();
Status ValidateBytesInputs(
const std::string& input_id, const Input& input,
TRITONSERVER_MemoryType* buffer_memory_type) const;
// Helpers for pending request metrics
void IncrementPendingRequestCount();
void DecrementPendingRequestCount();
// Whether anything in the request has potentially changed in a way
// that requires normalization when preparing the request for
// inference.
bool needs_normalization_;
// The model associated with this request. For most requests
// model_shared_ will be non-null and will act to keep the model
// alive as long as this request is live. In this case model_raw_
// will be the raw pointer from the shared pointer. For cases where
// the model itself created the request (like running requests for
// warmup), model_shared_ will be nullptr, but model_raw_ will
// still be defined. Thus model_raw_ is always defined and should
// always be used to access the model.
std::shared_ptr<Model> model_shared_;
Model* model_raw_;
// The model version as requested and, based on the version policy,
// the specific version that is actually used for inference.
int64_t requested_model_version_;
int64_t actual_model_version_;
std::string id_;
uint32_t flags_;
SequenceId correlation_id_;
uint32_t batch_size_;
uint64_t priority_;
uint64_t timeout_us_;
std::string cache_key_ = "";
// Helper to determine if the request was successfully hashed
// and the cache_key_ field is valid
bool cache_key_is_set_ = false;
std::unordered_map<std::string, Input> original_inputs_;
std::unordered_map<std::string, std::shared_ptr<Input>> override_inputs_;
std::unordered_map<std::string, Input*> inputs_;
std::set<std::string> original_requested_outputs_;
std::string raw_input_name_;
uint32_t raw_input_size_;
// requested_outputs_ is to be used post-normalization. It will be
// empty unless it differs from original_requested_outputs_, so it
// should typically be accessed through ImmutableRequestedOutputs().
std::set<std::string> requested_outputs_;
// The release function and user pointer for this request.
TRITONSERVER_InferenceRequestReleaseFn_t release_fn_;
void* release_userp_;
// Additional release callbacks invoked before 'release_fn_'.
std::vector<InternalReleaseFn> release_callbacks_;
// Delegator to be invoked on sending responses.
std::function<void(std::unique_ptr<InferenceResponse>&&, const uint32_t)>
response_delegator_;
// The response factory associated with this request.
std::shared_ptr<InferenceResponseFactory> response_factory_;
// Request timestamps. Queue start is needed for schedulers even
// when statistics are not being collected.
uint64_t queue_start_ns_;
// Cache lookup start/end timestamps. Cache manages its own stats even
// when statistics are not being collected.
uint64_t cache_lookup_start_ns_;
uint64_t cache_lookup_end_ns_;
// Cache insertion start/end timestamps. Cache manages its own stats even
// when statistics are not being collected.
uint64_t cache_insertion_start_ns_;
uint64_t cache_insertion_end_ns_;
// Dedicated timestamp for internal batcher use. It can diverge from the
// queue start timestamp to provide an accurate queue time without
// affecting batcher functionality.
uint64_t batcher_start_ns_;
// Whether the stats of the request should be collected.
bool collect_stats_;
// The parameters of the request. Use a deque so that there is no
// reallocation.
std::deque<InferenceParameter> parameters_;
#ifdef TRITON_ENABLE_STATS
uint64_t request_start_ns_;
InferenceStatsAggregator* secondary_stats_aggregator_ = nullptr;
#endif // TRITON_ENABLE_STATS
#ifdef TRITON_ENABLE_TRACING
// Inference trace associated with this request.
std::shared_ptr<InferenceTraceProxy> trace_;
#endif // TRITON_ENABLE_TRACING
// Sequence I/O states used for implicit state.
std::shared_ptr<SequenceStates> sequence_states_;
// The state of the request.
std::atomic<InferenceRequest::State> state_;
// Whether this is a null request used for direct sequence batch padding or
// not.
bool null_request_;
// Response factory arguments
const ResponseAllocator* response_allocator_;
void* response_userp_;
void* alloc_userp_;
TRITONSERVER_InferenceResponseCompleteFn_t response_callback_;
};
std::ostream& operator<<(std::ostream& out, const InferenceRequest& request);
std::ostream& operator<<(
std::ostream& out, const InferenceRequest::Input& input);
std::ostream& operator<<(
std::ostream& out, const InferenceRequest::SequenceId& sequence_id);
bool operator==(
const InferenceRequest::SequenceId lhs,
const InferenceRequest::SequenceId rhs);
}} // namespace triton::core
namespace std {
using namespace triton::core;
template <>
class hash<InferenceRequest::SequenceId> {
public:
size_t operator()(const InferenceRequest::SequenceId& sequence_id) const
{
if (sequence_id.Type() == InferenceRequest::SequenceId::DataType::STRING) {
return std::hash<std::string>{}(sequence_id.StringValue());
}
return std::hash<uint64_t>{}(sequence_id.UnsignedIntValue());
}
};
} // namespace std
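// Example (sketch): with the std::hash specialization above (and the
// operator== declared earlier), SequenceId can key standard unordered
// containers.
//
//   std::unordered_map<triton::core::InferenceRequest::SequenceId, int>
//       active_slots;
//   active_slots[triton::core::InferenceRequest::SequenceId(uint64_t{7})] = 1;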