1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
use crate::{ffi, logger::Logger};
use cudarc::driver::sys::{CUevent, CUstream};
use cxx::UniquePtr;
/// Element data types supported by the engine, mirroring TensorRT's
/// `nvinfer1::DataType`. Discriminant values match the C++ enum and are
/// relied upon by `CudaEngine::get_tensor_dtype`.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum DataType {
    /// 32-bit floating point format.
    FLOAT = 0,
    /// IEEE 16-bit floating-point format.
    HALF = 1,
    /// Signed 8-bit integer representing a quantized floating-point value.
    INT8 = 2,
    /// Signed 32-bit integer format.
    INT32 = 3,
    /// 8-bit boolean. 0 = false, 1 = true, other values undefined.
    BOOL = 4,
    /// Unsigned 8-bit integer format.
    ///
    /// Cannot be used to represent quantized floating-point values.
    /// Use the IdentityLayer to convert kUINT8 network-level inputs to
    /// {kFLOAT, kHALF} prior to use with other TensorRT layers, or to convert
    /// intermediate output before kUINT8 network-level outputs from
    /// {kFLOAT, kHALF} to kUINT8.
    ///
    /// kUINT8 conversions are only supported for {kFLOAT, kHALF}:
    /// kUINT8 to {kFLOAT, kHALF} converts the integer values to equivalent
    /// floating point values; {kFLOAT, kHALF} to kUINT8 converts by truncating
    /// towards zero, with undefined behavior for floating point values outside
    /// the range [0.0f, 256.0f) after truncation.
    /// kUINT8 conversions are not supported for {kINT8, kINT32, kBOOL}.
    UINT8 = 5,
    /// Signed 8-bit floating point with 1 sign bit, 4 exponent bits,
    /// 3 mantissa bits, and exponent-bias 7.
    ///
    /// \warning kFP8 is not supported yet and will result in an error or
    /// undefined behavior.
    FP8 = 6,
}
impl DataType {
pub fn get_elem_size(&self) -> usize {
match self {
DataType::FLOAT => 4,
DataType::HALF => 2,
DataType::INT8 => 1,
DataType::INT32 => 4,
DataType::BOOL => 1,
DataType::UINT8 => 1,
DataType::FP8 => 1,
}
}
}
/// Whether a named tensor is an engine input, an output, or neither.
/// Mirrors TensorRT's `nvinfer1::TensorIOMode`; discriminants match the C++
/// enum and are relied upon by `CudaEngine::get_tensor_io_mode`.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum TensorIOMode {
    /// Tensor is not an input or output.
    NONE = 0,
    /// Tensor is input to the engine.
    INPUT = 1,
    /// Tensor is output by the engine.
    OUTPUT = 2,
}
impl TensorIOMode {
    /// True if this tensor is an engine input.
    pub fn is_input(&self) -> bool {
        matches!(self, TensorIOMode::INPUT)
    }
    /// True if this tensor is an engine output.
    pub fn is_output(&self) -> bool {
        matches!(self, TensorIOMode::OUTPUT)
    }
}
/// Tensor memory layout formats, mirroring TensorRT's `nvinfer1::TensorFormat`.
/// Discriminants match the C++ enum and are relied upon by
/// `CudaEngine::get_tensor_format`.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum TensorFormat {
    /// Row major linear format.
    ///
    /// For a tensor with dimensions {N, C, H, W} or {numbers, channels,
    /// columns, rows}, the dimensional index corresponds to {3, 2, 1, 0}
    /// and thus the order is W minor.
    ///
    /// For DLA usage, the tensor sizes are limited to C,H,W in the range
    /// [1,8192].
    LINEAR = 0,
    /// Two wide channel vectorized row major format. This format is bound to
    /// FP16. It is only available for dimensions >= 3.
    ///
    /// For a tensor with dimensions {N, C, H, W}, the memory layout is
    /// equivalent to a C array with dimensions [N][(C+1)/2][H][W][2], with the
    /// tensor coordinates (n, c, h, w) mapping to array subscript
    /// [n][c/2][h][w][c%2].
    CHW2 = 1,
    /// Eight channel format where C is padded to a multiple of 8. This format
    /// is bound to FP16. It is only available for dimensions >= 3.
    ///
    /// For a tensor with dimensions {N, C, H, W}, the memory layout is
    /// equivalent to the array with dimensions [N][H][W][(C+7)/8*8], with the
    /// tensor coordinates (n, c, h, w) mapping to array subscript [n][h][w][c].
    HWC8 = 2,
    /// Four wide channel vectorized row major format. This format is bound to
    /// INT8 or FP16. It is only available for dimensions >= 3.
    /// For INT8, the C dimension must be a build-time constant.
    ///
    /// For a tensor with dimensions {N, C, H, W}, the memory layout is
    /// equivalent to a C array with dimensions [N][(C+3)/4][H][W][4], with the
    /// tensor coordinates (n, c, h, w) mapping to array subscript
    /// [n][c/4][h][w][c%4].
    ///
    /// Deprecated usage:
    ///
    /// If running on the DLA, this format can be used for acceleration with
    /// the caveat that C must be equal or lesser than 4. If used as DLA input
    /// and the build option kGPU_FALLBACK is not specified, it needs to meet
    /// line stride requirement of DLA format. Column stride in bytes should be
    /// a multiple of 32 on Xavier and 64 on Orin.
    CHW4 = 3,
    /// Sixteen wide channel vectorized row major format. This format is bound
    /// to FP16. It is only available for dimensions >= 3.
    ///
    /// For a tensor with dimensions {N, C, H, W}, the memory layout is
    /// equivalent to a C array with dimensions [N][(C+15)/16][H][W][16], with
    /// the tensor coordinates (n, c, h, w) mapping to array subscript
    /// [n][c/16][h][w][c%16].
    ///
    /// For DLA usage, this format maps to the native feature format for FP16,
    /// and the tensor sizes are limited to C,H,W in the range [1,8192].
    CHW16 = 4,
    /// Thirty-two wide channel vectorized row major format. This format is
    /// only available for dimensions >= 3.
    ///
    /// For a tensor with dimensions {N, C, H, W}, the memory layout is
    /// equivalent to a C array with dimensions [N][(C+31)/32][H][W][32], with
    /// the tensor coordinates (n, c, h, w) mapping to array subscript
    /// [n][c/32][h][w][c%32].
    ///
    /// For DLA usage, this format maps to the native feature format for INT8,
    /// and the tensor sizes are limited to C,H,W in the range [1,8192].
    CHW32 = 5,
    /// Eight channel format where C is padded to a multiple of 8. This format
    /// is bound to FP16, and it is only available for dimensions >= 4.
    ///
    /// For a tensor with dimensions {N, C, D, H, W}, the memory layout is
    /// equivalent to an array with dimensions [N][D][H][W][(C+7)/8*8], with
    /// the tensor coordinates (n, c, d, h, w) mapping to array subscript
    /// [n][d][h][w][c].
    DHWC8 = 6,
    /// Thirty-two wide channel vectorized row major format. This format is
    /// bound to FP16 and INT8 and is only available for dimensions >= 4.
    ///
    /// For a tensor with dimensions {N, C, D, H, W}, the memory layout is
    /// equivalent to a C array with dimensions [N][(C+31)/32][D][H][W][32],
    /// with the tensor coordinates (n, c, d, h, w) mapping to array subscript
    /// [n][c/32][d][h][w][c%32].
    CDHW32 = 7,
    /// Non-vectorized channel-last format. This format is bound to either
    /// FP32 or UINT8, and is only available for dimensions >= 3.
    HWC = 8,
    /// DLA planar format. For a tensor with dimension {N, C, H, W}, the W axis
    /// always has unit stride. The stride for stepping along the H axis is
    /// rounded up to 64 bytes.
    ///
    /// The memory layout is equivalent to a C array with dimensions
    /// [N][C][H][roundUp(W, 64/elementSize)] where elementSize is 2 for FP16
    /// and 1 for Int8, with the tensor coordinates (n, c, h, w) mapping to
    /// array subscript [n][c][h][w].
    DLALINEAR = 9,
    /// DLA image format. For a tensor with dimension {N, C, H, W} the C axis
    /// always has unit stride. The stride for stepping along the H axis is
    /// rounded up to 32 bytes on Xavier and 64 bytes on Orin. C can only be
    /// 1, 3 or 4.
    /// If C == 1, it will map to grayscale format.
    /// If C == 3 or C == 4, it will map to color image format. And if C == 3,
    /// the stride for stepping along the W axis needs to be padded to 4 in
    /// elements.
    ///
    /// When C is {1, 3, 4}, then C' is {1, 4, 4} respectively, and the memory
    /// layout is equivalent to a C array with dimensions
    /// [N][H][roundUp(W, 32/C'/elementSize)][C'] on Xavier and
    /// [N][H][roundUp(W, 64/C'/elementSize)][C'] on Orin, where elementSize is
    /// 2 for FP16 and 1 for Int8. The tensor coordinates (n, c, h, w) map to
    /// array subscript [n][h][w][c].
    DLAHWC4 = 10,
    /// Sixteen channel format where C is padded to a multiple of 16. This
    /// format is bound to FP16. It is only available for dimensions >= 3.
    ///
    /// For a tensor with dimensions {N, C, H, W}, the memory layout is
    /// equivalent to the array with dimensions [N][H][W][(C+15)/16*16], with
    /// the tensor coordinates (n, c, h, w) mapping to array subscript
    /// [n][h][w][c].
    HWC16 = 11,
    /// Non-vectorized channel-last format. This format is bound to FP32.
    /// It is only available for dimensions >= 4.
    DHWC = 12,
}
/// Restrictions an engine was built under, mirroring TensorRT's
/// `nvinfer1::EngineCapability`. Discriminants match the C++ enum and are
/// relied upon by `CudaEngine::get_engine_capability`.
///
/// Derives match the sibling enums in this module (`DataType`,
/// `TensorIOMode`, ...) so the value can be logged and compared.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum EngineCapability {
    /// Standard: TensorRT flow without targeting the safety runtime.
    /// This flow supports both DeviceType::kGPU and DeviceType::kDLA.
    STANDARD = 0,
    /// Safety: TensorRT flow with restrictions targeting the safety runtime.
    /// See safety documentation for list of supported layers and formats.
    /// This flow supports only DeviceType::kGPU.
    ///
    /// This flag is only supported in NVIDIA Drive(R) products.
    SAFETY = 1,
    /// DLA Standalone: TensorRT flow with restrictions targeting external,
    /// to TensorRT, DLA runtimes. See DLA documentation for list of supported
    /// layers and formats. This flow supports only DeviceType::kDLA.
    DLASTANDALONE = 2,
}
/// GPU-architecture compatibility the engine was built for, mirroring
/// TensorRT's `nvinfer1::HardwareCompatibilityLevel`. Discriminants match the
/// C++ enum and are relied upon by
/// `CudaEngine::get_hardware_compatibility_level`.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum HardwareCompatibilityLevel {
    /// Do not require hardware compatibility with GPU architectures other
    /// than that of the GPU on which the engine was built.
    NONE = 0,
    /// Require that the engine is compatible with Ampere and newer GPUs. This
    /// will limit the max shared memory usage to 48KiB, may reduce the number
    /// of available tactics for each layer, and may prevent some fusions from
    /// occurring. Thus this can decrease the performance, especially for tf32
    /// models. This option will disable cuDNN, cuBLAS, and cuBLAS LT as
    /// tactic sources.
    AMPEREPLUS = 1,
}
/// Verbosity of layer information in NVTX annotations, mirroring TensorRT's
/// `nvinfer1::ProfilingVerbosity`. Discriminants match the C++ enum; the
/// value is passed through to the FFI in
/// `ExecutionContext::set_nvtx_verbosity`.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum ProfilingVerbosity {
    /// Print only the layer names. This is the default setting.
    LAYERNAMESONLY = 0,
    /// Do not print any layer information.
    NONE = 1,
    /// Print detailed layer information including layer names and layer
    /// parameters.
    DETAILED = 2,
}
/// Owning wrapper around the FFI TensorRT runtime plus the logger it was
/// created with.
pub struct Runtime {
    // FFI runtime handle; constructed from a pinned reference into `logger`
    // (see `Runtime::new`).
    pub(crate) runtime: UniquePtr<ffi::Runtime>,
    // Kept alive alongside `runtime` because the runtime was created from it.
    // NOTE(review): field order means `runtime` drops before `logger`, which
    // looks intentional — confirm the C++ side requires the logger to outlive
    // the runtime.
    logger: Logger,
}
impl Runtime {
    /// Creates a logger and a TensorRT runtime bound to it.
    /// Returns `None` when the FFI layer fails to produce a runtime.
    pub fn new() -> Option<Self> {
        let mut logger = Logger::new();
        let runtime = ffi::create_runtime(logger.0.pin_mut());
        if runtime.is_null() {
            return None;
        }
        Some(Self { runtime, logger })
    }
    /// Mutable access to the logger this runtime was created with.
    pub fn logger(&mut self) -> &mut Logger {
        &mut self.logger
    }
    /// Deserializes a serialized engine blob; `None` on failure.
    pub fn deserialize(&mut self, data: &[u8]) -> Option<CudaEngine> {
        let engine = self.runtime.pin_mut().deserialize(data);
        (!engine.is_null()).then(|| CudaEngine(engine))
    }
    /// Sets the maximum number of threads the runtime may use; returns the
    /// FFI call's success flag.
    pub fn set_max_threads(&mut self, max_threads: i32) -> bool {
        self.runtime.pin_mut().set_max_threads(max_threads)
    }
    /// Reads back the runtime's maximum thread count.
    pub fn get_max_threads(&self) -> i32 {
        self.runtime.get_max_threads()
    }
    /// Sets whether engines with host executable code are allowed.
    pub fn set_engine_host_code_allowed(&mut self, allowed: bool) {
        self.runtime.pin_mut().set_engine_host_code_allowed(allowed)
    }
    /// Reads back whether engines with host executable code are allowed.
    pub fn get_engine_host_code_allowed(&self) -> bool {
        self.runtime.get_engine_host_code_allowed()
    }
}
/// Owning wrapper around a deserialized FFI engine (see `Runtime::deserialize`).
pub struct CudaEngine(pub(crate) UniquePtr<ffi::CudaEngine>);
impl CudaEngine {
    /// Shape (dimensions) of the named I/O tensor.
    pub fn get_tensor_shape(&self, name: &str) -> Vec<i32> {
        self.0.get_tensor_shape(name)
    }
    /// Data type of the named I/O tensor.
    ///
    /// # Panics
    /// Panics if the FFI layer reports a value outside the known
    /// `DataType` discriminants.
    pub fn get_tensor_dtype(&self, name: &str) -> DataType {
        // The previous `DataType::FLOAT as _` casts were enum-to-enum casts
        // (the inferred target is the `DataType` return type), which `as`
        // does not permit; the variants are returned directly instead.
        match self.0.get_tensor_dtype(name) {
            0 => DataType::FLOAT,
            1 => DataType::HALF,
            2 => DataType::INT8,
            3 => DataType::INT32,
            4 => DataType::BOOL,
            5 => DataType::UINT8,
            6 => DataType::FP8,
            dtype => panic!("Invalid data type: {}", dtype),
        }
    }
    /// Number of layers in the engine.
    pub fn get_num_layers(&self) -> i32 {
        self.0.get_num_layers()
    }
    /// Creates an execution context with its own device memory;
    /// `None` on failure.
    pub fn create_execution_context(&mut self) -> Option<ExecutionContext> {
        let context = self.0.pin_mut().create_execution_context();
        if context.is_null() {
            None
        } else {
            Some(ExecutionContext(context))
        }
    }
    /// Whether the named tensor is consumed/produced by shape inference.
    pub fn is_shape_inference_io(&self, name: &str) -> bool {
        self.0.is_shape_inference_io(name)
    }
    /// I/O mode (input/output/none) of the named tensor.
    ///
    /// # Panics
    /// Panics if the FFI layer reports a value outside the known
    /// `TensorIOMode` discriminants.
    pub fn get_tensor_io_mode(&self, name: &str) -> TensorIOMode {
        match self.0.get_tensor_io_mode(name) {
            0 => TensorIOMode::NONE,
            1 => TensorIOMode::INPUT,
            2 => TensorIOMode::OUTPUT,
            mode => panic!("Invalid tensor io mode: {}", mode),
        }
    }
    /// Creates an execution context that expects device memory to be supplied
    /// later (see `ExecutionContext::set_device_memory`); `None` on failure.
    pub fn create_execution_context_without_device_memory(&mut self) -> Option<ExecutionContext> {
        let context = self
            .0
            .pin_mut()
            .create_execution_context_without_device_memory();
        if context.is_null() {
            None
        } else {
            Some(ExecutionContext(context))
        }
    }
    /// Device memory size, in bytes, required by an execution context.
    pub fn get_device_memory_size(&self) -> usize {
        self.0.get_device_memory_size()
    }
    /// Whether the engine can be refit with new weights.
    pub fn is_refittable(&self) -> bool {
        self.0.is_refittable()
    }
    /// Bytes per component of an element of the named tensor.
    pub fn get_tensor_bytes_per_component(&self, name: &str) -> i32 {
        self.0.get_tensor_bytes_per_component(name)
    }
    /// Number of components included in one element of the named tensor.
    pub fn get_tensor_components_per_element(&self, name: &str) -> i32 {
        self.0.get_tensor_components_per_element(name)
    }
    /// Memory layout format of the named tensor.
    ///
    /// # Panics
    /// Panics if the FFI layer reports a value outside the known
    /// `TensorFormat` discriminants.
    pub fn get_tensor_format(&self, name: &str) -> TensorFormat {
        match self.0.get_tensor_format(name) {
            0 => TensorFormat::LINEAR,
            1 => TensorFormat::CHW2,
            2 => TensorFormat::HWC8,
            3 => TensorFormat::CHW4,
            4 => TensorFormat::CHW16,
            5 => TensorFormat::CHW32,
            6 => TensorFormat::DHWC8,
            7 => TensorFormat::CDHW32,
            8 => TensorFormat::HWC,
            9 => TensorFormat::DLALINEAR,
            10 => TensorFormat::DLAHWC4,
            11 => TensorFormat::HWC16,
            12 => TensorFormat::DHWC,
            format => panic!("Invalid tensor format: {}", format),
        }
    }
    /// Dimension index along which the named tensor is vectorized.
    pub fn get_tensor_vectorized_dim(&self, name: &str) -> i32 {
        self.0.get_tensor_vectorized_dim(name)
    }
    /// Name of the network associated with the engine.
    pub fn get_name(&self) -> &str {
        self.0.get_name()
    }
    /// Number of optimization profiles defined for this engine.
    pub fn get_num_optimization_profiles(&self) -> i32 {
        self.0.get_num_optimization_profiles()
    }
    /// Capability mode (standard/safety/DLA-standalone) the engine was built
    /// for.
    ///
    /// # Panics
    /// Panics if the FFI layer reports a value outside the known
    /// `EngineCapability` discriminants.
    pub fn get_engine_capability(&self) -> EngineCapability {
        match self.0.get_engine_capability() {
            0 => EngineCapability::STANDARD,
            1 => EngineCapability::SAFETY,
            2 => EngineCapability::DLASTANDALONE,
            capability => panic!("Invalid engine capability: {}", capability),
        }
    }
    /// Whether the engine was built with an implicit batch dimension.
    pub fn has_implicit_batch_dimension(&self) -> bool {
        self.0.has_implicit_batch_dimension()
    }
    /// Number of I/O tensors (inputs plus outputs).
    pub fn get_num_io_tensors(&self) -> i32 {
        self.0.get_num_io_tensors()
    }
    /// Name of the I/O tensor at `index` (valid for
    /// `0..get_num_io_tensors()`).
    pub fn get_io_tensor_name(&self, index: i32) -> &str {
        self.0.get_io_tensor_name(index)
    }
    /// Hardware compatibility level the engine was built with.
    ///
    /// # Panics
    /// Panics if the FFI layer reports a value outside the known
    /// `HardwareCompatibilityLevel` discriminants.
    pub fn get_hardware_compatibility_level(&self) -> HardwareCompatibilityLevel {
        match self.0.get_hardware_compatibility_level() {
            0 => HardwareCompatibilityLevel::NONE,
            1 => HardwareCompatibilityLevel::AMPEREPLUS,
            level => panic!("Invalid hardware compatibility level: {}", level),
        }
    }
    /// Number of auxiliary streams the engine may use (see
    /// `ExecutionContext::set_aux_streams`).
    pub fn get_num_aux_streams(&self) -> i32 {
        self.0.get_num_aux_streams()
    }
}
/// Owning wrapper around an FFI execution context created from a `CudaEngine`.
pub struct ExecutionContext(pub(crate) UniquePtr<ffi::ExecutionContext>);
impl ExecutionContext {
    /// Enables/disables synchronous debug execution.
    pub fn set_debug_sync(&mut self, sync: bool) {
        self.0.pin_mut().set_debug_sync(sync)
    }
    /// Reads back the debug-sync flag.
    pub fn get_debug_sync(&self) -> bool {
        self.0.get_debug_sync()
    }
    /// Names this execution context (e.g. for profiling/log output).
    pub fn set_name(&mut self, name: &str) {
        self.0.pin_mut().set_name(name)
    }
    /// Reads back the context name.
    pub fn get_name(&self) -> &str {
        self.0.get_name()
    }
    /// Supplies device memory for a context created without it.
    /// `memory` is passed through to the FFI as a `usize`; presumably a raw
    /// device pointer — confirm against the ffi signature.
    pub fn set_device_memory(&mut self, memory: usize) {
        self.0.pin_mut().set_device_memory(memory)
    }
    /// Strides of the named tensor, one entry per dimension.
    pub fn get_tensor_strides(&self, name: &str) -> Vec<i32> {
        self.0.get_tensor_strides(name)
    }
    /// Index of the currently selected optimization profile.
    pub fn get_optimization_profile(&self) -> i32 {
        self.0.get_optimization_profile()
    }
    /// Sets the runtime shape of a (dynamic) input tensor; returns the FFI
    /// call's success flag.
    pub fn set_input_shape(&mut self, name: &str, shape: &[i32]) -> bool {
        self.0.pin_mut().set_input_shape(name, shape)
    }
    /// Current shape of the named tensor under this context.
    pub fn get_tensor_shape(&self, name: &str) -> Vec<i32> {
        self.0.get_tensor_shape(name)
    }
    /// Whether all dynamic input dimensions have been specified.
    pub fn all_input_dimensions_specified(&self) -> bool {
        self.0.all_input_dimensions_specified()
    }
    /// Whether all input shape tensors have been specified.
    pub fn all_input_shapes_specified(&self) -> bool {
        self.0.all_input_shapes_specified()
    }
    /// Selects an optimization profile asynchronously on `stream`; returns
    /// the FFI call's success flag.
    /// The raw `CUstream` is cast (`as _`) to whatever integer/pointer type
    /// the ffi declares.
    pub fn set_optimization_profile_async(
        &mut self,
        profile_index: i32,
        stream: &CUstream,
    ) -> bool {
        let stream_raw = *stream;
        self.0
            .pin_mut()
            .set_optimization_profile_async(profile_index, stream_raw as _)
    }
    /// Controls whether enqueue emits profiling data.
    pub fn set_enqueue_emits_profile(&mut self, emits: bool) {
        self.0.pin_mut().set_enqueue_emits_profile(emits)
    }
    /// Reads back the enqueue-emits-profile flag.
    pub fn get_enqueue_emits_profile(&self) -> bool {
        self.0.get_enqueue_emits_profile()
    }
    /// Flushes collected profiling data to the attached profiler.
    pub fn report_to_profiler(&mut self) {
        self.0.pin_mut().report_to_profiler()
    }
    /// Binds the named tensor to a memory address; returns the FFI call's
    /// success flag. `address` is a raw pointer value carried as `usize`.
    pub fn set_tensor_address(&mut self, name: &str, address: usize) -> bool {
        self.0.pin_mut().set_tensor_address(name, address)
    }
    /// Address currently bound to the named tensor (as a raw `usize`).
    pub fn get_tensor_address(&self, name: &str) -> usize {
        self.0.get_tensor_address(name)
    }
    /// Binds an input tensor to a memory address; returns the FFI call's
    /// success flag.
    pub fn set_input_tensor_address(&mut self, name: &str, address: usize) -> bool {
        self.0.pin_mut().set_input_tensor_address(name, address)
    }
    /// Address currently bound to the named output tensor.
    pub fn get_output_tensor_address(&self, name: &str) -> usize {
        self.0.get_output_tensor_address(name)
    }
    /// Registers a CUDA event signaled once inputs have been consumed;
    /// returns the FFI call's success flag. The raw `CUevent` is cast to the
    /// type the ffi declares.
    pub fn set_input_consumed_event(&mut self, event: &CUevent) -> bool {
        let event_raw = *event;
        self.0.pin_mut().set_input_consumed_event(event_raw as _)
    }
    /// Reads back the input-consumed event, cast back to a `CUevent`.
    pub fn get_input_consumed_event(&self) -> CUevent {
        let event_raw = self.0.get_input_consumed_event();
        event_raw as _
    }
    /// Maximum size, in bytes, the named output may require.
    pub fn get_max_output_size(&self, name: &str) -> usize {
        self.0.get_max_output_size(name)
    }
    /// Enqueues inference on `stream`; returns the FFI call's success flag.
    /// The stream handle is passed through as a `usize`.
    pub fn enqueue_v3(&mut self, stream: &CUstream) -> bool {
        let stream_raw = *stream;
        self.0.pin_mut().enqueue_v3(stream_raw as usize)
    }
    /// Caps the persistent cache usage, in bytes.
    pub fn set_persistent_cache_limit(&mut self, limit: usize) {
        self.0.pin_mut().set_persistent_cache_limit(limit)
    }
    /// Reads back the persistent cache limit, in bytes.
    pub fn get_persistent_cache_limit(&self) -> usize {
        self.0.get_persistent_cache_limit()
    }
    /// Sets the NVTX annotation verbosity (enum passed as its discriminant).
    pub fn set_nvtx_verbosity(&mut self, verbosity: ProfilingVerbosity) {
        self.0.pin_mut().set_nvtx_verbosity(verbosity as _)
    }
    /// Hands auxiliary CUDA streams to the context; each raw handle is cast
    /// to the type the ffi declares.
    pub fn set_aux_streams(&mut self, streams: &[&CUstream]) {
        let streams: Vec<_> = streams
            .iter()
            .map(|stream| {
                let s = **stream;
                s as _
            })
            .collect();
        self.0.pin_mut().set_aux_streams(streams.as_slice())
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::logger::Severity;
    // Smoke test: builds a runtime, and — when the local engine file exists —
    // deserializes it, creates a context, and logs every I/O tensor.
    #[test]
    fn test_runtime() {
        use std::{io::Read, path::Path};
        let mut runtime = Runtime::new().unwrap();
        runtime.logger().log(Severity::Info, "Hello, world!");
        let engine_path = Path::new("../tmp/pp-ocr-v4-det-fp16.engine");
        if !engine_path.exists() {
            runtime
                .logger()
                .log(Severity::Info, "Engine file not found! Skip test!");
            return;
        }
        let mut data = Vec::new();
        std::fs::File::open(engine_path)
            .unwrap()
            .read_to_end(&mut data)
            .unwrap();
        let mut engine = runtime.deserialize(&data).unwrap();
        let _context = engine.create_execution_context().unwrap();
        let num_io_tensors = engine.get_num_io_tensors();
        let msg = format!("num_io_tensors: {}", num_io_tensors);
        runtime.logger().log(Severity::Info, &msg);
        for i in 0..num_io_tensors {
            let name = engine.get_io_tensor_name(i);
            let mode = engine.get_tensor_io_mode(name);
            let shape = engine.get_tensor_shape(name);
            let msg = format!("name: {}, mode: {:?}, shape: {:?}", name, mode, shape);
            runtime.logger().log(Severity::Info, &msg);
        }
    }
}