pub struct CudaEngine { /* private fields */ }
CUDA engine (real mode)
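A CudaEngine is obtained by deserializing engine bytes through a Runtime, as the repository examples below do. A minimal sketch of that flow (a sketch only: the use paths are assumptions inferred from the examples):

use std::error::Error;
use trtx::{Logger, Runtime}; // assumed import paths

fn inspect_engine(engine_data: &[u8]) -> Result<(), Box<dyn Error>> {
    let logger = Logger::stderr()?;
    let runtime = Runtime::new(&logger)?;

    // Deserialize a previously built, serialized engine.
    let engine = runtime.deserialize_cuda_engine(engine_data)?;

    // Enumerate the engine's I/O tensors by index.
    for i in 0..engine.get_nb_io_tensors()? {
        println!("tensor {}: {}", i, engine.get_tensor_name(i)?);
    }

    // An execution context borrows the engine and drives inference.
    let _context = engine.create_execution_context()?;
    Ok(())
}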
Implementations§

impl CudaEngine
pub fn get_nb_io_tensors(&self) -> Result<i32>

Returns the number of input/output (I/O) tensors in the engine.
Examples found in repository
examples/basic_workflow.rs (line 84)
18 fn main() -> Result<(), Box<dyn Error>> {
19 #[cfg(feature = "dlopen_tensorrt_rtx")]
20 trtx::dynamically_load_tensorrt(None::<String>).unwrap();
21 #[cfg(feature = "dlopen_tensorrt_onnxparser")]
22 trtx::dynamically_load_tensorrt_onnxparser(None::<String>).unwrap();
23
24 println!("TensorRT-RTX Basic Workflow Example");
25 println!("=====================================\n");
26
27 // Step 1: Create logger
28 println!("1. Creating logger...");
29 let logger = Logger::stderr()?;
30 println!(" ✓ Logger created\n");
31
32 // Step 2: Build phase
33 println!("2. Building engine...");
34
35 let builder = Builder::new(&logger)?;
36 println!(" ✓ Builder created");
37
38 // Create network with explicit batch dimensions
39 let mut network = builder.create_network(network_flags::EXPLICIT_BATCH)?;
40 println!(" ✓ Network created");
41
42 // Create and configure builder config
43 let mut config = builder.create_config()?;
44 println!(" ✓ Config created");
45
46 // Set workspace memory limit (1GB)
47 config.set_memory_pool_limit(MemoryPoolType::Workspace, 1 << 30)?;
48 println!(" ✓ Workspace limit set to 1GB");
49
50 // Note: In a real application, you would add layers to the network here
51 // For example:
52 // - network.add_input(...)
53 // - network.add_convolution(...)
54 // - network.add_activation(...)
55 // - etc.
56
57 println!("\n Note: This example uses an empty network.");
58 println!(" In production, you would:");
59 println!(" - Parse an ONNX model");
60 println!(" - Or programmatically add layers");
61 println!(" - Define input/output tensors\n");
62
63 // Build serialized network
64 println!(" Building serialized engine...");
65 match builder.build_serialized_network(&mut network, &mut config) {
66 Ok(engine_data) => {
67 println!(" ✓ Engine built ({} bytes)", engine_data.len());
68
69 // Save to disk
70 let engine_path = "/tmp/example.engine";
71 std::fs::write(engine_path, &engine_data)?;
72 println!(" ✓ Engine saved to {}\n", engine_path);
73
74 // Step 3: Inference phase
75 println!("3. Loading engine for inference...");
76
77 let runtime = Runtime::new(&logger)?;
78 println!(" ✓ Runtime created");
79
80 let engine = runtime.deserialize_cuda_engine(&engine_data)?;
81 println!(" ✓ Engine deserialized");
82
83 // Query engine information
84 let num_tensors = engine.get_nb_io_tensors()?;
85 println!(" ✓ Engine has {} I/O tensors", num_tensors);
86
87 for i in 0..num_tensors {
88 let name = engine.get_tensor_name(i)?;
89 println!(" - Tensor {}: {}", i, name);
90 }
91
92 // Create execution context
93 let _context = engine.create_execution_context()?;
94 println!(" ✓ Execution context created\n");
95
96 println!("4. Next steps for real inference:");
97 println!(" - Allocate CUDA memory for inputs/outputs");
98 println!(" - Copy input data to GPU");
99 println!(" - Bind tensor addresses with context.set_tensor_address()");
100 println!(" - Execute with context.enqueue_v3()");
101 println!(" - Copy results back to CPU");
102 }
103 Err(e) => {
104 eprintln!(" ✗ Failed to build engine: {}", e);
105 eprintln!("\n This is expected for an empty network.");
106 eprintln!(" In production, add layers before building.");
107 return Err(e.into());
108 }
109 }
110
111 println!("\n✓ Example completed successfully!");
112
113 Ok(())
114 }

More examples
examples/tiny_network.rs (line 42)
18 fn main() -> Result<()> {
19 #[cfg(feature = "dlopen_tensorrt_rtx")]
20 trtx::dynamically_load_tensorrt(None::<String>).unwrap();
21 #[cfg(feature = "dlopen_tensorrt_onnxparser")]
22 trtx::dynamically_load_tensorrt_onnxparser(None::<String>).unwrap();
23
24 println!("=== Tiny Network Example ===\n");
25
26 // 1. Create logger
27 println!("1. Creating logger...");
28 let logger = Logger::stderr()?;
29
30 // 2. Build the network
31 println!("2. Building network...");
32 let engine_data = build_tiny_network(&logger)?;
33 println!(" Engine size: {} bytes", engine_data.len());
34
35 // 3. Create runtime and deserialize engine
36 println!("\n3. Creating runtime and loading engine...");
37 let runtime = Runtime::new(&logger)?;
38 let engine = runtime.deserialize_cuda_engine(&engine_data)?;
39
40 // 4. Inspect engine
41 println!("4. Engine information:");
42 let num_io_tensors = engine.get_nb_io_tensors()?;
43 println!(" Number of I/O tensors: {}", num_io_tensors);
44
45 for i in 0..num_io_tensors {
46 let name = engine.get_tensor_name(i)?;
47 println!(" Tensor {}: {}", i, name);
48 }
49
50 // 5. Create execution context
51 println!("\n5. Creating execution context...");
52 let mut context = engine.create_execution_context()?;
53
54 // 6. Prepare input/output buffers
55 println!("6. Preparing buffers...");
56 let input_size = 3 * 4 * 4; // [1, 3, 4, 4]
57 let output_size = 3 * 4 * 4; // Same as input
58
59 // Create input with mix of positive and negative values
60 let input_data: Vec<f32> = (0..input_size)
61 .map(|i| {
62 // Create pattern: positive, negative, zero, positive, ...
63 match i % 4 {
64 0 => (i as f32) * 0.5, // Positive values
65 1 => -(i as f32) * 0.3, // Negative values
66 2 => 0.0, // Zero
67 _ => (i as f32) * 0.1, // Small positive values
68 }
69 })
70 .collect();
71
72 println!(" Input shape: [1, 3, 4, 4] ({} elements)", input_size);
73 println!(" First 8 input values: {:?}", &input_data[..8]);
74
75 // Allocate device memory
76 let mut input_device = DeviceBuffer::new(input_size * std::mem::size_of::<f32>())?;
77 let output_device = DeviceBuffer::new(output_size * std::mem::size_of::<f32>())?;
78
79 // Copy input to device (convert f32 slice to bytes)
80 let input_bytes = unsafe {
81 std::slice::from_raw_parts(
82 input_data.as_ptr() as *const u8,
83 input_data.len() * std::mem::size_of::<f32>(),
84 )
85 };
86 input_device.copy_from_host(input_bytes)?;
87
88 // 7. Set tensor addresses
89 println!("\n7. Binding tensors...");
90 unsafe {
91 context.set_tensor_address("input", input_device.as_ptr())?;
92 context.set_tensor_address("output", output_device.as_ptr())?;
93 }
94
95 // 8. Execute inference
96 println!("8. Running inference...");
97 let stream = trtx::cuda::get_default_stream();
98 unsafe {
99 context.enqueue_v3(stream)?;
100 }
101 synchronize()?;
102 println!(" ✓ Inference completed");
103
104 // 9. Copy output back to host
105 println!("\n9. Reading results...");
106 let mut output_data: Vec<f32> = vec![0.0; output_size];
107 let output_bytes = unsafe {
108 std::slice::from_raw_parts_mut(
109 output_data.as_mut_ptr() as *mut u8,
110 output_data.len() * std::mem::size_of::<f32>(),
111 )
112 };
113 output_device.copy_to_host(output_bytes)?;
114
115 println!(" Output shape: [1, 3, 4, 4] ({} elements)", output_size);
116 println!(" First 8 output values: {:?}", &output_data[..8]);
117
118 // 10. Verify results
119 println!("\n10. Verification:");
120 println!(" ReLU function: max(0, x)");
121 println!(" - Positive inputs should pass through unchanged");
122 println!(" - Negative inputs should become 0.0");
123 println!(" - Zero inputs should remain 0.0");
124
125 let mut passed = true;
126 let mut failures = Vec::new();
127
128 for (i, (&input, &output)) in input_data.iter().zip(output_data.iter()).enumerate() {
129 let expected = if input > 0.0 { input } else { 0.0 };
130 let diff = (output - expected).abs();
131
132 if diff > 1e-6 {
133 passed = false;
134 if failures.len() < 5 {
135 failures.push((i, input, expected, output));
136 }
137 }
138 }
139
140 if passed {
141 println!(
142 "\n ✓ PASS: All {} outputs match expected ReLU behavior!",
143 output_size
144 );
145
146 // Show some examples
147 println!("\n Sample verification (first 8 elements):");
148 for i in 0..8.min(input_size) {
149 let input = input_data[i];
150 let output = output_data[i];
151 let expected = if input > 0.0 { input } else { 0.0 };
152 println!(
153 " [{:2}] ReLU({:7.3}) = {:7.3} (expected {:7.3}) ✓",
154 i, input, output, expected
155 );
156 }
157 } else {
158 println!("\n ✗ FAIL: {} mismatches found!", failures.len());
159 for (i, input, expected, output) in failures {
160 println!(
161 " [{:2}] ReLU({:7.3}) = {:7.3}, expected {:7.3}",
162 i, input, output, expected
163 );
164 }
165 }
166
167 println!("\n=== Example completed ===");
168 Ok(())
169 }

pub fn get_tensor_name(&self, index: i32) -> Result<String>

Returns the name of the I/O tensor at the given index.
Examples found in repository
examples/basic_workflow.rs (line 88) and examples/tiny_network.rs (line 46): see the full listings under get_nb_io_tensors above. Both examples loop over the tensor indices and call get_tensor_name on each.

pub fn get_tensor_shape(&self, name: &str) -> Result<Vec<i64>>
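Returns the shape of the named I/O tensor, one entry per dimension. Neither repository example calls this method directly; below is a minimal sketch, assuming the "input" tensor name from tiny_network.rs, the crate's Result alias, and the TensorRT convention that dynamic dimensions are reported as -1:

// Hypothetical helper: print a tensor's shape and, when all dimensions
// are static, its element count.
fn describe_tensor(engine: &CudaEngine, name: &str) -> Result<()> {
    let shape = engine.get_tensor_shape(name)?; // e.g. [1, 3, 4, 4]
    if shape.iter().all(|&d| d >= 0) {
        let elements: i64 = shape.iter().product();
        println!("{}: {:?} ({} elements)", name, shape, elements);
    } else {
        // A -1 entry marks a dimension that is only fixed at runtime.
        println!("{}: {:?} (has dynamic dimensions)", name, shape);
    }
    Ok(())
}

For the tiny_network.rs engine, describe_tensor(&engine, "input") would print: input: [1, 3, 4, 4] (48 elements).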
pub fn create_execution_context(&self) -> Result<ExecutionContext<'_>>

Creates an execution context that borrows this engine and is used to run inference.
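The returned context borrows the engine (hence the '_ lifetime), so the engine must outlive every context created from it. A minimal sketch of driving inference through a context, condensed from tiny_network.rs below and assuming input_device and output_device are already-populated DeviceBuffers:

let mut context = engine.create_execution_context()?;

// set_tensor_address and enqueue_v3 are unsafe: the caller must ensure the
// bound pointers are valid device allocations large enough for the
// engine's tensor shapes.
unsafe {
    context.set_tensor_address("input", input_device.as_ptr())?;
    context.set_tensor_address("output", output_device.as_ptr())?;
    context.enqueue_v3(trtx::cuda::get_default_stream())?;
}
synchronize()?; // block until the GPU finishes before reading outputs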
Examples found in repository
examples/basic_workflow.rs (line 93) and examples/tiny_network.rs (line 52): see the full listings under get_nb_io_tensors above. tiny_network.rs goes on to bind device addresses on the context and launch inference with enqueue_v3.

Trait Implementations§
Auto Trait Implementations§
impl Freeze for CudaEngine
impl RefUnwindSafe for CudaEngine
impl Unpin for CudaEngine
impl UnwindSafe for CudaEngine
Blanket Implementations§
impl<T> BorrowMut<T> for T
where
    T: ?Sized,
fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value.