1use npu_rs::{
2 NpuDevice, DeviceInfo, Tensor, ExecutionContext, ModelConfig, ModelRuntime,
3 QuantFormat, OptimizationLevel, NeuralNetwork, Layer, LayerType,
4 QuantStats, QuantConverter, PTQEngine, GraphOptimizer, ComputationGraph,
5 ComputeNode, Profiler, ProfileEvent,
6};
7use std::sync::Arc;
8
9fn main() {
11 println!("=== Full NPU Inference Pipeline ===\n");
12
13 setup_device();
14 build_model();
15 quantize_model();
16 execute_inference();
17 monitor_performance();
18
19 println!("\n=== Pipeline Completed ===\n");
20}
21
22fn setup_device() {
23 println!("1. Device Setup");
24
25 let device_info = DeviceInfo {
26 device_id: 0,
27 peak_throughput_tops: 20.0,
28 memory_mb: 512,
29 compute_units: 4,
30 frequency_mhz: 800,
31 power_tdp_watts: 5.0,
32 vendor: "Xilinx/SiFive".to_string(),
33 device_name: "XilinxAI-Engine 20-TOPS".to_string(),
34 };
35
36 let device = Arc::new(NpuDevice::with_config(device_info));
37 match device.initialize() {
38 Ok(_) => {
39 println!(" ✓ Device initialized");
40 let info = device.get_info();
41 println!(" Device: {} ({} MB memory, {} TOPS peak)",
42 info.device_name, info.memory_mb, info.peak_throughput_tops
43 );
44
45 let memory_pool = device.get_memory_pool();
46 let manager = memory_pool.get_manager();
47 println!(" Available memory: {} MB\n",
48 manager.get_available_bytes() / 1024 / 1024
49 );
50 }
51 Err(e) => println!(" ✗ Init failed: {}\n", e),
52 }
53}
54
55fn build_model() {
56 println!("2. Model Building");
57
58 let model_config = ModelConfig {
59 name: "ResNet18-Lite".to_string(),
60 input_shape: vec![1, 224, 224, 3],
61 output_shape: vec![1, 1000],
62 quant_format: QuantFormat::Int8,
63 optimization_level: OptimizationLevel::O3,
64 use_cache: true,
65 };
66
67 let runtime = ModelRuntime::new(model_config);
68 println!(" Model: {}", runtime.get_config().name);
69 println!(" Input: {:?}", runtime.input_shape());
70 println!(" Output: {:?}", runtime.output_shape());
71
72 let mut network = NeuralNetwork::new(runtime.get_config().name.clone());
73
74 network.add_layer(Layer::new(
75 "stem_conv".to_string(),
76 LayerType::Convolution,
77 vec![1, 224, 224, 3],
78 vec![1, 112, 112, 64],
79 ));
80
81 network.add_layer(Layer::new(
82 "residual_block_1".to_string(),
83 LayerType::PointwiseConvolution,
84 vec![1, 112, 112, 64],
85 vec![1, 112, 112, 64],
86 ));
87
88 network.add_layer(Layer::new(
89 "residual_block_2".to_string(),
90 LayerType::PointwiseConvolution,
91 vec![1, 56, 56, 128],
92 vec![1, 56, 56, 128],
93 ));
94
95 network.add_layer(Layer::new(
96 "global_avg_pool".to_string(),
97 LayerType::Pooling,
98 vec![1, 7, 7, 512],
99 vec![1, 512],
100 ));
101
102 network.add_layer(Layer::new(
103 "classifier".to_string(),
104 LayerType::FullyConnected,
105 vec![1, 512],
106 vec![1, 1000],
107 ));
108
109 println!(" Layers: {}", network.layer_count());
110 println!(" Estimated TOPS: {:.6}\n", network.total_tops());
111}
112
113fn quantize_model() {
114 println!("3. Model Quantization");
115
116 let calibration_data = vec![
117 Tensor::random(&[1, 224, 224, 3]).data,
118 Tensor::random(&[1, 224, 224, 3]).data,
119 Tensor::random(&[1, 224, 224, 3]).data,
120 ];
121
122 let ptq = PTQEngine::new(8, false);
123 match ptq.calibrate(&calibration_data) {
124 Ok(converter) => {
125 println!(" ✓ Calibration complete");
126
127 let sample = &calibration_data[0];
128 let stats = QuantStats::from_tensor(sample);
129 println!(" Calibration Stats:");
130 println!(" - Min: {:.6}", stats.min_val);
131 println!(" - Max: {:.6}", stats.max_val);
132 println!(" - Mean: {:.6}", stats.mean_val);
133 println!(" - Std: {:.6}", stats.std_val);
134
135 match converter.quantize_tensor(sample) {
136 Ok(quantized) => {
137 println!(" ✓ Quantization complete: {} values", quantized.len());
138 println!(" Compression: {:.2}x\n",
139 (sample.len() * 4) as f64 / quantized.len() as f64
140 );
141 }
142 Err(e) => println!(" ✗ Quantization failed: {}\n", e),
143 }
144 }
145 Err(e) => println!(" ✗ Calibration failed: {}\n", e),
146 }
147}
148
149fn execute_inference() {
150 println!("4. Inference Execution");
151
152 let device = Arc::new(NpuDevice::new());
153 match device.initialize() {
154 Ok(_) => {
155 let ctx = ExecutionContext::new(device);
156
157 let input = Tensor::random(&[1, 224, 224, 3]);
158 let weights = Tensor::random(&[1, 1, 3, 64]);
159
160 println!(" Input: {:?}", input.shape());
161 println!(" Weights: {:?}", weights.shape());
162
163 match ctx.execute_conv1x1(&input.data, &weights.data) {
164 Ok(output) => {
165 println!(" ✓ Conv1x1 executed");
166 println!(" Output: {:?}", output.shape());
167 println!(" Throughput: {:.4} GOPS\n", ctx.get_current_throughput_gops());
168 }
169 Err(e) => println!(" ✗ Execution failed: {}\n", e),
170 }
171 }
172 Err(e) => println!(" ✗ Device init failed: {}\n", e),
173 }
174}
175
176fn monitor_performance() {
177 println!("5. Performance Monitoring");
178
179 let device = Arc::new(NpuDevice::new());
180 let _ = device.initialize();
181
182 let mut profiler = Profiler::new(device);
183
184 let ops_profile = vec![
185 ("Conv3x3_In16_Out32", 1728, 0.25, 4.5),
186 ("MatMul_512x1000", 1_024_000, 0.15, 4.2),
187 ("ReLU_Activation", 512_000, 0.05, 2.1),
188 ];
189
190 println!(" Recording {} operations...", ops_profile.len());
191 for (name, ops, duration, power) in ops_profile {
192 profiler.record_event(ProfileEvent {
193 event_name: name.to_string(),
194 start_time_ms: 0.0,
195 duration_ms: duration,
196 ops_count: ops,
197 power_watts: power,
198 });
199 }
200
201 println!(" \n Operations Profile:");
202 for event in profiler.get_events() {
203 println!(" - {}: {:.2} GOPS, {:.2} W",
204 event.event_name,
205 event.get_throughput_gops(),
206 event.power_watts
207 );
208 }
209
210 let report = profiler.generate_report();
211 println!("\n Performance Summary:");
212 println!(" - Total Ops: {}", report.total_operations);
213 println!(" - Total Time: {:.4} ms", report.total_time_ms);
214 println!(" - Avg Throughput: {:.2} GOPS", report.avg_throughput_gops);
215 println!(" - Peak Power: {:.2} W\n", report.peak_power_watts);
216}