1use serde::{Deserialize, Serialize};
5
6#[derive(Debug, Clone, Default, Serialize, Deserialize)]
9pub struct FullProfile {
10 pub version: String,
11 pub timestamp: String,
12 #[serde(default)]
13 pub hardware: HardwareInfo,
14 #[serde(default)]
15 pub kernel: Option<KernelInfo>,
16 #[serde(default)]
17 pub timing: TimingMetrics,
18 #[serde(default)]
19 pub throughput: ThroughputMetrics,
20 #[serde(default)]
21 pub roofline: Option<RooflineMetrics>,
22 #[serde(default)]
23 pub gpu_compute: Option<GpuComputeMetrics>,
24 #[serde(default)]
25 pub gpu_memory: Option<GpuMemoryMetrics>,
26 #[serde(default)]
27 pub gpu_stalls: Option<GpuStallMetrics>,
28 #[serde(default)]
29 pub gpu_transfer: Option<GpuTransferMetrics>,
30 #[serde(default)]
31 pub vram: Option<VramMetrics>,
32 #[serde(default)]
33 pub pcie: Option<PcieMetrics>,
34 #[serde(default)]
35 pub system_health: Option<SystemHealthMetrics>,
36 #[serde(default)]
37 pub energy: Option<EnergyMetrics>,
38 #[serde(default)]
39 pub cpu_counters: Option<CpuHwCounters>,
40 #[serde(default)]
41 pub cpu_simd: Option<CpuSimdCounters>,
42 #[serde(default)]
43 pub arm_counters: Option<ArmCounters>,
44 #[serde(default)]
45 pub cpu_memory: Option<CpuMemoryMetrics>,
46 #[serde(default)]
47 pub swap: Option<SwapMetrics>,
48 #[serde(default)]
49 pub disk_io: Option<DiskIoMetrics>,
50 #[serde(default)]
51 pub network_io: Option<NetworkIoMetrics>,
52 #[serde(default)]
53 pub numa: Option<NumaMetrics>,
54 #[serde(default)]
55 pub wasm: Option<WasmMetrics>,
56 #[serde(default)]
57 pub quant: Option<QuantMetrics>,
58 #[serde(default)]
59 pub rayon: Option<RayonMetrics>,
60 #[serde(default)]
61 pub compilation: Option<CompilationMetrics>,
62 #[serde(default)]
63 pub async_profiling: Option<AsyncMetrics>,
64 #[serde(default)]
65 pub muda: Vec<MudaEntry>,
66 #[serde(default)]
67 pub metal: Option<MetalMetrics>,
68 #[serde(default)]
69 pub regression: Option<RegressionMetrics>,
70 #[serde(default)]
71 pub syscall: Option<SyscallMetrics>,
72}
73
74#[derive(Debug, Clone, Default, Serialize, Deserialize)]
76pub struct TimingMetrics {
77 pub wall_clock_time_us: f64,
78 pub samples: u32,
79 pub stddev_us: f64,
80 pub ci_95_low_us: f64,
81 pub ci_95_high_us: f64,
82}
83
84#[derive(Debug, Clone, Default, Serialize, Deserialize)]
86pub struct ThroughputMetrics {
87 pub tflops: f64,
88 pub gflops: f64,
89 pub bandwidth_gbps: f64,
90 pub arithmetic_intensity: f64,
91}
92
93#[derive(Debug, Clone, Serialize, Deserialize)]
95pub struct RooflineMetrics {
96 pub peak_compute_tflops: f64,
97 pub peak_bandwidth_gbps: f64,
98 pub ridge_point: f64,
99 pub bound: String,
100 pub efficiency_pct: f64,
101 pub distance_to_ridge: f64,
102}
103
104#[derive(Debug, Clone, Default, Serialize, Deserialize)]
106pub struct GpuComputeMetrics {
107 pub sm_utilization_pct: f64,
108 pub achieved_occupancy_pct: f64,
109 pub warp_execution_efficiency_pct: f64,
110 pub branch_efficiency_pct: f64,
111 pub tensor_core_utilization_pct: f64,
112 pub ipc: f64,
113 pub flop16_ops: u64,
114 pub flop32_ops: u64,
115 pub register_usage_per_thread: u32,
116 pub shared_memory_per_block: u32,
117 pub grid_dimensions: [u32; 3],
118 pub block_dimensions: [u32; 3],
119}
120
121#[derive(Debug, Clone, Default, Serialize, Deserialize)]
123pub struct GpuMemoryMetrics {
124 pub dram_throughput_pct: f64,
125 pub l1_hit_rate_pct: f64,
126 pub l2_hit_rate_pct: f64,
127 pub global_load_efficiency_pct: f64,
128 pub global_store_efficiency_pct: f64,
129 pub shared_load_efficiency_pct: f64,
130 pub shared_store_efficiency_pct: f64,
131 pub shared_bank_conflicts: u64,
132}
133
134#[derive(Debug, Clone, Default, Serialize, Deserialize)]
136pub struct GpuStallMetrics {
137 pub barrier_stall_cycles: u64,
138 pub memory_stall_cycles: u64,
139 pub pipeline_bubbles: u64,
140 pub warp_scheduler_idle_pct: f64,
141}
142
143#[derive(Debug, Clone, Default, Serialize, Deserialize)]
145pub struct GpuTransferMetrics {
146 pub h2d_bandwidth_gbps: f64,
147 pub d2h_bandwidth_gbps: f64,
148 pub pcie_utilization_pct: f64,
149}
150
151#[derive(Debug, Clone, Default, Serialize, Deserialize)]
153pub struct VramMetrics {
154 pub vram_used_mb: f64,
155 pub vram_total_mb: f64,
156 pub vram_free_mb: f64,
157 pub vram_utilization_pct: f64,
158 pub vram_peak_mb: f64,
159 pub vram_allocation_count: u64,
160 pub vram_fragmentation_pct: f64,
161}
162
163#[derive(Debug, Clone, Default, Serialize, Deserialize)]
165pub struct PcieMetrics {
166 pub pcie_gen: u32,
167 pub pcie_width: u32,
168 pub pcie_bandwidth_theoretical_gbps: f64,
169 pub pcie_rx_throughput_gbps: f64,
170 pub pcie_tx_throughput_gbps: f64,
171}
172
173#[derive(Debug, Clone, Default, Serialize, Deserialize)]
175pub struct SystemHealthMetrics {
176 pub gpu_temperature_celsius: f64,
177 pub gpu_power_watts: f64,
178 pub gpu_clock_mhz: f64,
179 pub gpu_memory_clock_mhz: f64,
180 pub cpu_frequency_mhz: f64,
181 pub cpu_temperature_celsius: f64,
182 pub gpu_memory_used_mb: f64,
183 pub gpu_memory_total_mb: f64,
184}
185
186#[derive(Debug, Clone, Default, Serialize, Deserialize)]
188pub struct EnergyMetrics {
189 pub tflops_per_watt: f64,
190 pub joules_per_inference: f64,
191}
192
193#[derive(Debug, Clone, Default, Serialize, Deserialize)]
195pub struct CpuHwCounters {
196 pub cycles: u64,
197 pub instructions: u64,
198 pub cache_references: u64,
199 pub cache_misses: u64,
200 pub l1_dcache_load_misses: u64,
201 pub llc_loads: u64,
202 pub branches: u64,
203 pub branch_misses: u64,
204}
205
206#[derive(Debug, Clone, Default, Serialize, Deserialize)]
208pub struct CpuSimdCounters {
209 pub fp_arith_scalar_single: u64,
210 pub fp_arith_128b_packed_single: u64,
211 pub fp_arith_256b_packed_single: u64,
212 pub fp_arith_512b_packed_single: u64,
213 pub simd_utilization_pct: f64,
214}
215
216#[derive(Debug, Clone, Default, Serialize, Deserialize)]
218pub struct ArmCounters {
219 pub inst_retired: u64,
220 pub cpu_cycles: u64,
221 pub ase_spec: u64,
222}
223
224#[derive(Debug, Clone, Default, Serialize, Deserialize)]
226pub struct CpuMemoryMetrics {
227 pub rss_mb: f64,
228 pub rss_peak_mb: f64,
229 pub vms_mb: f64,
230 pub heap_allocated_mb: f64,
231 pub heap_peak_mb: f64,
232 pub malloc_count: u64,
233 pub free_count: u64,
234 pub memory_leaks_bytes: u64,
235}
236
237#[derive(Debug, Clone, Default, Serialize, Deserialize)]
239pub struct SwapMetrics {
240 pub swap_used_mb: f64,
241 pub swap_in_count: u64,
242 pub swap_out_count: u64,
243 pub swap_activity_detected: bool,
244}
245
246#[derive(Debug, Clone, Default, Serialize, Deserialize)]
248pub struct DiskIoMetrics {
249 pub disk_read_bytes: u64,
250 pub disk_write_bytes: u64,
251 pub disk_read_iops: f64,
252 pub disk_write_iops: f64,
253 pub io_wait_pct: f64,
254 pub file_descriptors_open: u64,
255}
256
257#[derive(Debug, Clone, Default, Serialize, Deserialize)]
259pub struct NetworkIoMetrics {
260 pub net_rx_bytes: u64,
261 pub net_tx_bytes: u64,
262}
263
264#[derive(Debug, Clone, Default, Serialize, Deserialize)]
266pub struct NumaMetrics {
267 pub numa_node: u32,
268 pub numa_remote_access_pct: f64,
269 pub cpu_affinity_mask: String,
270 pub voluntary_ctx_switches: u64,
271 pub involuntary_ctx_switches: u64,
272 pub cpu_migration_count: u64,
273}
274
275#[derive(Debug, Clone, Default, Serialize, Deserialize)]
277pub struct WasmMetrics {
278 pub instruction_count: u64,
279 pub fuel_consumed: u64,
280 pub simd128_detected: bool,
281}
282
283#[derive(Debug, Clone, Default, Serialize, Deserialize)]
285pub struct QuantMetrics {
286 pub superblocks_per_sec: f64,
287 pub effective_bandwidth_gbps: f64,
288 pub compression_speedup: f64,
289}
290
291#[derive(Debug, Clone, Default, Serialize, Deserialize)]
293pub struct RayonMetrics {
294 pub parallel_speedup: f64,
295 pub parallel_efficiency: f64,
296 pub heijunka_score: f64,
297 pub thread_spawn_overhead_us: f64,
298 pub work_steal_count: u64,
299 pub num_threads: u32,
300}
301
302#[derive(Debug, Clone, Default, Serialize, Deserialize)]
304pub struct CompilationMetrics {
305 pub ptx_jit_time_ms: f64,
306 pub ptx_cache_hit: bool,
307 pub ptx_size_bytes: u64,
308 pub sass_instruction_count: u64,
309}
310
311#[derive(Debug, Clone, Default, Serialize, Deserialize)]
313pub struct AsyncMetrics {
314 pub poll_count: u64,
315 pub poll_efficiency: f64,
316 pub yield_ratio: f64,
317 pub avg_poll_latency_us: f64,
318}
319
320#[derive(Debug, Clone, Serialize, Deserialize)]
322pub struct MudaEntry {
323 pub muda_type: String,
324 pub source: String,
325 pub impact_pct: f64,
326}
327
328#[derive(Debug, Clone, Default, Serialize, Deserialize)]
330pub struct MetalMetrics {
331 pub gpu_timestamp_ns: u64,
332 pub dispatch_config: String,
333}
334
335#[derive(Debug, Clone, Default, Serialize, Deserialize)]
337pub struct RegressionMetrics {
338 pub regression_pct: f64,
339 pub p_value: f64,
340 pub effect_size_cohens_d: f64,
341 pub verdict: String,
342}
343
344#[derive(Debug, Clone, Default, Serialize, Deserialize)]
346pub struct SyscallMetrics {
347 pub total_syscalls: u64,
348 pub syscall_breakdown: std::collections::HashMap<String, u64>,
349 pub io_overhead_pct: f64,
350 pub page_faults_minor: u64,
351 pub page_faults_major: u64,
352}
353
354#[derive(Debug, Clone, Default, Serialize, Deserialize)]
356pub struct HardwareInfo {
357 pub gpu: Option<String>,
358 pub gpu_sm: Option<String>,
359 pub gpu_memory_gb: Option<f64>,
360 pub gpu_bandwidth_gbps: Option<f64>,
361 pub gpu_pcie: Option<String>,
362 pub cpu: Option<String>,
363 pub cpu_features: Vec<String>,
364 pub numa_nodes: Option<u32>,
365}
366
367#[derive(Debug, Clone, Default, Serialize, Deserialize)]
369pub struct KernelInfo {
370 pub name: String,
371 pub dimensions: Vec<u32>,
372 pub grid: Option<[u32; 3]>,
373 pub block: Option<[u32; 3]>,
374 pub shared_memory_bytes: Option<u32>,
375 pub registers_per_thread: Option<u32>,
376}
377
/// Returns the total number of metrics across all 28 metric categories (150).
///
/// The table lists one count per category, in the order the category fields
/// are declared on `FullProfile` (from `timing` through `syscall`).
pub fn total_metric_count() -> usize {
    const PER_CATEGORY: [usize; 28] = [
        5,  // timing
        4,  // throughput
        6,  // roofline
        12, // gpu_compute
        8,  // gpu_memory
        4,  // gpu_stalls
        3,  // gpu_transfer
        7,  // vram
        5,  // pcie
        8,  // system_health
        2,  // energy
        8,  // cpu_counters
        5,  // cpu_simd
        3,  // arm_counters
        8,  // cpu_memory
        4,  // swap
        6,  // disk_io
        2,  // network_io
        6,  // numa
        3,  // wasm
        3,  // quant
        6,  // rayon
        4,  // compilation
        4,  // async_profiling
        // NOTE(review): 13 does not match `MudaEntry`'s three fields; it
        // presumably counts something else (e.g. muda kinds) - confirm.
        13, // muda
        2,  // metal
        4,  // regression
        5,  // syscall
    ];
    PER_CATEGORY.iter().sum()
}
409
#[cfg(test)]
mod tests {
    use super::*;

    /// `FullProfile` fields that are not counted as metric categories.
    const NON_CATEGORY_FIELDS: [&str; 4] = ["version", "timestamp", "hardware", "kernel"];

    /// The per-category counts must add up to the advertised total.
    #[test]
    fn test_metric_count() {
        assert_eq!(total_metric_count(), 150);
    }

    /// Counts the metric categories from the serialized shape of `FullProfile`
    /// instead of comparing a literal with itself, so adding or removing a
    /// category field makes this test fail.
    #[test]
    fn test_category_count() {
        let value = serde_json::to_value(FullProfile::default()).unwrap();
        let fields = value
            .as_object()
            .expect("FullProfile should serialize to a JSON object");

        for name in NON_CATEGORY_FIELDS.iter() {
            assert!(fields.contains_key(*name), "missing field `{}`", name);
        }

        let categories = fields.len() - NON_CATEGORY_FIELDS.len();
        assert_eq!(categories, 28);
    }

    /// Only `version` and `timestamp` are required; everything else defaults.
    #[test]
    fn test_minimal_json_uses_defaults() {
        let json = r#"{"version":"2.0","timestamp":"2026-04-04T12:00:00Z"}"#;
        let parsed: FullProfile = serde_json::from_str(json).unwrap();

        assert_eq!(parsed.version, "2.0");
        assert_eq!(parsed.timing.samples, 0);
        assert!(parsed.kernel.is_none());
        assert!(parsed.roofline.is_none());
        assert!(parsed.muda.is_empty());
    }

    /// A profile survives a serialize / deserialize round trip.
    #[test]
    fn test_full_profile_json_roundtrip() {
        let profile = FullProfile {
            version: "2.0".to_string(),
            timestamp: "2026-04-04T12:00:00Z".to_string(),
            timing: TimingMetrics {
                wall_clock_time_us: 23.2,
                samples: 50,
                stddev_us: 0.3,
                ci_95_low_us: 23.0,
                ci_95_high_us: 23.4,
            },
            throughput: ThroughputMetrics {
                tflops: 11.6,
                gflops: 0.0,
                bandwidth_gbps: 78.4,
                arithmetic_intensity: 16.0,
            },
            ..Default::default()
        };

        let json = serde_json::to_string_pretty(&profile).unwrap();
        let parsed: FullProfile = serde_json::from_str(&json).unwrap();

        assert_eq!(parsed.version, "2.0");
        assert_eq!(parsed.timestamp, "2026-04-04T12:00:00Z");
        assert_eq!(parsed.timing.samples, 50);
        assert!((parsed.timing.wall_clock_time_us - 23.2).abs() < 0.01);
        assert!((parsed.throughput.tflops - 11.6).abs() < 0.01);
        // Sections left at their defaults stay absent after the round trip.
        assert!(parsed.roofline.is_none());
        assert!(parsed.muda.is_empty());
    }
}