use serde::{Deserialize, Serialize};
/// Top-level profiling record that aggregates every metric section in this file.
///
/// Deserialization contract (as established by the serde attributes below):
/// * `version` and `timestamp` carry no `#[serde(default)]` and therefore must be
///   present in the input.
/// * Every other field is marked `#[serde(default)]`, so it may be omitted and is
///   then filled with its `Default` value: `None` for the `Option` sections, an
///   empty vector for `muda`, and the derived defaults for `hardware`, `timing`
///   and `throughput`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct FullProfile {
    /// Version string of the profile (required on input).
    pub version: String,
    /// Timestamp string of the profile (required on input); the format is not
    /// validated here.
    pub timestamp: String,

    /// Description of the hardware; defaults to an empty `HardwareInfo`.
    #[serde(default)]
    pub hardware: HardwareInfo,
    /// Optional description of the profiled kernel.
    #[serde(default)]
    pub kernel: Option<KernelInfo>,

    /// Timing statistics; always present, zeroed when missing from the input.
    #[serde(default)]
    pub timing: TimingMetrics,
    /// Throughput figures; always present, zeroed when missing from the input.
    #[serde(default)]
    pub throughput: ThroughputMetrics,
    /// Optional roofline-model section.
    #[serde(default)]
    pub roofline: Option<RooflineMetrics>,

    /// Optional GPU compute section.
    #[serde(default)]
    pub gpu_compute: Option<GpuComputeMetrics>,
    /// Optional GPU memory section.
    #[serde(default)]
    pub gpu_memory: Option<GpuMemoryMetrics>,
    /// Optional GPU stall section.
    #[serde(default)]
    pub gpu_stalls: Option<GpuStallMetrics>,
    /// Optional GPU transfer section.
    #[serde(default)]
    pub gpu_transfer: Option<GpuTransferMetrics>,
    /// Optional VRAM section.
    #[serde(default)]
    pub vram: Option<VramMetrics>,
    /// Optional PCIe section.
    #[serde(default)]
    pub pcie: Option<PcieMetrics>,

    /// Optional system-health section.
    #[serde(default)]
    pub system_health: Option<SystemHealthMetrics>,
    /// Optional energy section.
    #[serde(default)]
    pub energy: Option<EnergyMetrics>,

    /// Optional CPU hardware-counter section.
    #[serde(default)]
    pub cpu_counters: Option<CpuHwCounters>,
    /// Optional CPU SIMD-counter section.
    #[serde(default)]
    pub cpu_simd: Option<CpuSimdCounters>,
    /// Optional ARM counter section.
    #[serde(default)]
    pub arm_counters: Option<ArmCounters>,
    /// Optional CPU memory section.
    #[serde(default)]
    pub cpu_memory: Option<CpuMemoryMetrics>,
    /// Optional swap section.
    #[serde(default)]
    pub swap: Option<SwapMetrics>,
    /// Optional disk I/O section.
    #[serde(default)]
    pub disk_io: Option<DiskIoMetrics>,
    /// Optional network I/O section.
    #[serde(default)]
    pub network_io: Option<NetworkIoMetrics>,
    /// Optional NUMA section.
    #[serde(default)]
    pub numa: Option<NumaMetrics>,

    /// Optional WebAssembly section.
    #[serde(default)]
    pub wasm: Option<WasmMetrics>,
    /// Optional quantization section.
    #[serde(default)]
    pub quant: Option<QuantMetrics>,
    /// Optional Rayon section.
    #[serde(default)]
    pub rayon: Option<RayonMetrics>,
    /// Optional compilation section.
    #[serde(default)]
    pub compilation: Option<CompilationMetrics>,
    /// Optional async-profiling section.
    #[serde(default)]
    pub async_profiling: Option<AsyncMetrics>,

    /// List of muda entries; empty when missing from the input.
    #[serde(default)]
    pub muda: Vec<MudaEntry>,
    /// Optional Metal section.
    #[serde(default)]
    pub metal: Option<MetalMetrics>,
    /// Optional regression section.
    #[serde(default)]
    pub regression: Option<RegressionMetrics>,
    /// Optional syscall section.
    #[serde(default)]
    pub syscall: Option<SyscallMetrics>,
}
/// Timing statistics stored in `FullProfile::timing`.
///
/// NOTE(review): the `_us` suffix presumably means microseconds and `ci_95_*`
/// presumably a 95% confidence interval — confirm against the producer of
/// these values; nothing in this file enforces either.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct TimingMetrics {
    /// Wall-clock time.
    pub wall_clock_time_us: f64,
    /// Sample count.
    pub samples: u32,
    /// Standard deviation.
    pub stddev_us: f64,
    /// Lower bound of the interval.
    pub ci_95_low_us: f64,
    /// Upper bound of the interval.
    pub ci_95_high_us: f64,
}
/// Throughput figures stored in `FullProfile::throughput`.
///
/// `tflops` and `gflops` are independent fields; this type does not keep them
/// in sync or derive one from the other.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ThroughputMetrics {
    /// Compute throughput expressed in TFLOPS (per the field name).
    pub tflops: f64,
    /// Compute throughput expressed in GFLOPS (per the field name).
    pub gflops: f64,
    /// Bandwidth; the `_gbps` suffix does not say whether this is gigabytes or
    /// gigabits per second — TODO confirm.
    pub bandwidth_gbps: f64,
    /// Arithmetic intensity; units are not stated in this file.
    pub arithmetic_intensity: f64,
}
/// Roofline-model section stored in `FullProfile::roofline`.
///
/// Unlike most metric structs in this file, this type does not derive
/// `Default`, so all fields must be supplied when constructing it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RooflineMetrics {
    /// Peak compute figure (TFLOPS per the field name).
    pub peak_compute_tflops: f64,
    /// Peak bandwidth figure.
    pub peak_bandwidth_gbps: f64,
    /// Ridge point of the roofline.
    pub ridge_point: f64,
    /// Free-form label for the limiting resource; the set of accepted values
    /// is not defined in this file.
    pub bound: String,
    /// Efficiency as a percentage (per the `_pct` suffix).
    pub efficiency_pct: f64,
    /// Distance to the ridge point; sign convention is not stated here.
    pub distance_to_ridge: f64,
}
/// GPU compute section stored in `FullProfile::gpu_compute`.
///
/// Fields suffixed `_pct` are percentages by name; no range is enforced.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct GpuComputeMetrics {
    /// SM utilization percentage.
    pub sm_utilization_pct: f64,
    /// Achieved occupancy percentage.
    pub achieved_occupancy_pct: f64,
    /// Warp execution efficiency percentage.
    pub warp_execution_efficiency_pct: f64,
    /// Branch efficiency percentage.
    pub branch_efficiency_pct: f64,
    /// Tensor-core utilization percentage.
    pub tensor_core_utilization_pct: f64,
    /// Instructions per cycle (presumably — confirm the abbreviation).
    pub ipc: f64,
    /// Count of 16-bit floating-point operations (per the field name).
    pub flop16_ops: u64,
    /// Count of 32-bit floating-point operations (per the field name).
    pub flop32_ops: u64,
    /// Register usage per thread.
    pub register_usage_per_thread: u32,
    /// Shared memory per block; unit is not stated here.
    pub shared_memory_per_block: u32,
    /// Three-component grid dimensions.
    pub grid_dimensions: [u32; 3],
    /// Three-component block dimensions.
    pub block_dimensions: [u32; 3],
}
/// GPU memory section stored in `FullProfile::gpu_memory`.
///
/// All `_pct` fields are percentages by name; no range is enforced.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct GpuMemoryMetrics {
    /// DRAM throughput percentage.
    pub dram_throughput_pct: f64,
    /// L1 hit-rate percentage.
    pub l1_hit_rate_pct: f64,
    /// L2 hit-rate percentage.
    pub l2_hit_rate_pct: f64,
    /// Global load efficiency percentage.
    pub global_load_efficiency_pct: f64,
    /// Global store efficiency percentage.
    pub global_store_efficiency_pct: f64,
    /// Shared load efficiency percentage.
    pub shared_load_efficiency_pct: f64,
    /// Shared store efficiency percentage.
    pub shared_store_efficiency_pct: f64,
    /// Number of shared-memory bank conflicts.
    pub shared_bank_conflicts: u64,
}
/// GPU stall section stored in `FullProfile::gpu_stalls`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct GpuStallMetrics {
    /// Cycles stalled on barriers.
    pub barrier_stall_cycles: u64,
    /// Cycles stalled on memory.
    pub memory_stall_cycles: u64,
    /// Number of pipeline bubbles.
    pub pipeline_bubbles: u64,
    /// Warp-scheduler idle percentage.
    pub warp_scheduler_idle_pct: f64,
}
/// GPU transfer section stored in `FullProfile::gpu_transfer`.
///
/// NOTE(review): `h2d` / `d2h` presumably mean host-to-device and
/// device-to-host — confirm with the producer.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct GpuTransferMetrics {
    /// Bandwidth in the `h2d` direction.
    pub h2d_bandwidth_gbps: f64,
    /// Bandwidth in the `d2h` direction.
    pub d2h_bandwidth_gbps: f64,
    /// PCIe utilization percentage.
    pub pcie_utilization_pct: f64,
}
/// VRAM section stored in `FullProfile::vram`.
///
/// The `_mb` fields are independent values; this type does not check that
/// used + free equals total.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct VramMetrics {
    /// VRAM in use.
    pub vram_used_mb: f64,
    /// Total VRAM.
    pub vram_total_mb: f64,
    /// Free VRAM.
    pub vram_free_mb: f64,
    /// VRAM utilization percentage.
    pub vram_utilization_pct: f64,
    /// Peak VRAM usage.
    pub vram_peak_mb: f64,
    /// Number of VRAM allocations.
    pub vram_allocation_count: u64,
    /// VRAM fragmentation percentage.
    pub vram_fragmentation_pct: f64,
}
/// PCIe section stored in `FullProfile::pcie`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct PcieMetrics {
    /// PCIe generation number.
    pub pcie_gen: u32,
    /// PCIe link width (presumably lane count — TODO confirm).
    pub pcie_width: u32,
    /// Theoretical PCIe bandwidth.
    pub pcie_bandwidth_theoretical_gbps: f64,
    /// Receive throughput.
    pub pcie_rx_throughput_gbps: f64,
    /// Transmit throughput.
    pub pcie_tx_throughput_gbps: f64,
}
/// System-health section stored in `FullProfile::system_health`.
///
/// Units are carried only by the field-name suffixes (`_celsius`, `_watts`,
/// `_mhz`, `_mb`); nothing here validates them.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SystemHealthMetrics {
    /// GPU temperature.
    pub gpu_temperature_celsius: f64,
    /// GPU power draw.
    pub gpu_power_watts: f64,
    /// GPU clock.
    pub gpu_clock_mhz: f64,
    /// GPU memory clock.
    pub gpu_memory_clock_mhz: f64,
    /// CPU frequency.
    pub cpu_frequency_mhz: f64,
    /// CPU temperature.
    pub cpu_temperature_celsius: f64,
    /// GPU memory in use.
    pub gpu_memory_used_mb: f64,
    /// Total GPU memory.
    pub gpu_memory_total_mb: f64,
}
/// Energy section stored in `FullProfile::energy`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct EnergyMetrics {
    /// TFLOPS per watt.
    pub tflops_per_watt: f64,
    /// Joules per inference.
    pub joules_per_inference: f64,
}
/// CPU hardware-counter section stored in `FullProfile::cpu_counters`.
///
/// Every field is a raw `u64` count; derived ratios are not stored in this
/// struct.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CpuHwCounters {
    /// Cycle count.
    pub cycles: u64,
    /// Instruction count.
    pub instructions: u64,
    /// Cache reference count.
    pub cache_references: u64,
    /// Cache miss count.
    pub cache_misses: u64,
    /// L1 data-cache load miss count.
    pub l1_dcache_load_misses: u64,
    /// LLC load count (presumably last-level cache — confirm).
    pub llc_loads: u64,
    /// Branch count.
    pub branches: u64,
    /// Branch miss count.
    pub branch_misses: u64,
}
/// CPU SIMD-counter section stored in `FullProfile::cpu_simd`.
///
/// NOTE(review): the `fp_arith_*` names look like per-width floating-point
/// event counters (scalar, 128-, 256- and 512-bit packed single precision) —
/// confirm against the collector that fills them.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CpuSimdCounters {
    /// Scalar single-precision count.
    pub fp_arith_scalar_single: u64,
    /// 128-bit packed single-precision count.
    pub fp_arith_128b_packed_single: u64,
    /// 256-bit packed single-precision count.
    pub fp_arith_256b_packed_single: u64,
    /// 512-bit packed single-precision count.
    pub fp_arith_512b_packed_single: u64,
    /// SIMD utilization percentage; stored as given, not computed here.
    pub simd_utilization_pct: f64,
}
/// ARM counter section stored in `FullProfile::arm_counters`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ArmCounters {
    /// Retired instruction count.
    pub inst_retired: u64,
    /// CPU cycle count.
    pub cpu_cycles: u64,
    /// `ase_spec` event count; the exact event semantics are not defined in
    /// this file — TODO confirm.
    pub ase_spec: u64,
}
/// CPU memory section stored in `FullProfile::cpu_memory`.
///
/// NOTE(review): `rss` / `vms` presumably mean resident-set size and virtual
/// memory size — confirm with the collector.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CpuMemoryMetrics {
    /// RSS value.
    pub rss_mb: f64,
    /// Peak RSS value.
    pub rss_peak_mb: f64,
    /// VMS value.
    pub vms_mb: f64,
    /// Heap currently allocated.
    pub heap_allocated_mb: f64,
    /// Peak heap usage.
    pub heap_peak_mb: f64,
    /// Number of `malloc` calls.
    pub malloc_count: u64,
    /// Number of `free` calls.
    pub free_count: u64,
    /// Leaked memory in bytes (note: bytes, unlike the `_mb` fields above).
    pub memory_leaks_bytes: u64,
}
/// Swap section stored in `FullProfile::swap`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SwapMetrics {
    /// Swap in use.
    pub swap_used_mb: f64,
    /// Swap-in count.
    pub swap_in_count: u64,
    /// Swap-out count.
    pub swap_out_count: u64,
    /// Flag for detected swap activity; stored as given, not derived from the
    /// counts above by this type.
    pub swap_activity_detected: bool,
}
/// Disk I/O section stored in `FullProfile::disk_io`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct DiskIoMetrics {
    /// Bytes read from disk.
    pub disk_read_bytes: u64,
    /// Bytes written to disk.
    pub disk_write_bytes: u64,
    /// Read IOPS.
    pub disk_read_iops: f64,
    /// Write IOPS.
    pub disk_write_iops: f64,
    /// I/O wait percentage.
    pub io_wait_pct: f64,
    /// Number of open file descriptors.
    pub file_descriptors_open: u64,
}
/// Network I/O section stored in `FullProfile::network_io`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct NetworkIoMetrics {
    /// Bytes received.
    pub net_rx_bytes: u64,
    /// Bytes transmitted.
    pub net_tx_bytes: u64,
}
/// NUMA section stored in `FullProfile::numa`.
///
/// Besides NUMA placement, this struct also carries context-switch and
/// CPU-migration counts.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct NumaMetrics {
    /// NUMA node number.
    pub numa_node: u32,
    /// Remote access percentage.
    pub numa_remote_access_pct: f64,
    /// CPU affinity mask as a string; its textual format is not defined in
    /// this file.
    pub cpu_affinity_mask: String,
    /// Voluntary context-switch count.
    pub voluntary_ctx_switches: u64,
    /// Involuntary context-switch count.
    pub involuntary_ctx_switches: u64,
    /// CPU migration count.
    pub cpu_migration_count: u64,
}
/// WebAssembly section stored in `FullProfile::wasm`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct WasmMetrics {
    /// Instruction count.
    pub instruction_count: u64,
    /// Fuel consumed.
    pub fuel_consumed: u64,
    /// Flag recording whether `simd128` was detected.
    pub simd128_detected: bool,
}
/// Quantization section stored in `FullProfile::quant`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct QuantMetrics {
    /// Superblocks processed per second.
    pub superblocks_per_sec: f64,
    /// Effective bandwidth.
    pub effective_bandwidth_gbps: f64,
    /// Compression speedup factor; the baseline is not defined in this file.
    pub compression_speedup: f64,
}
/// Rayon section stored in `FullProfile::rayon`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct RayonMetrics {
    /// Parallel speedup factor.
    pub parallel_speedup: f64,
    /// Parallel efficiency; whether this is a fraction or a percentage is not
    /// stated here — TODO confirm.
    pub parallel_efficiency: f64,
    /// Heijunka score; presumably a load-levelling measure — confirm the
    /// definition with the producer.
    pub heijunka_score: f64,
    /// Thread spawn overhead (`_us` suffix).
    pub thread_spawn_overhead_us: f64,
    /// Work-steal count.
    pub work_steal_count: u64,
    /// Number of threads.
    pub num_threads: u32,
}
/// Compilation section stored in `FullProfile::compilation`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CompilationMetrics {
    /// PTX JIT time (`_ms` suffix).
    pub ptx_jit_time_ms: f64,
    /// Whether the PTX cache was hit.
    pub ptx_cache_hit: bool,
    /// PTX size in bytes.
    pub ptx_size_bytes: u64,
    /// SASS instruction count.
    pub sass_instruction_count: u64,
}
/// Async-profiling section stored in `FullProfile::async_profiling`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct AsyncMetrics {
    /// Poll count.
    pub poll_count: u64,
    /// Poll efficiency; scale (fraction vs. percentage) is not stated here.
    pub poll_efficiency: f64,
    /// Yield ratio.
    pub yield_ratio: f64,
    /// Average poll latency (`_us` suffix).
    pub avg_poll_latency_us: f64,
}
/// One element of the `FullProfile::muda` list.
///
/// This type does not derive `Default`; every field must be supplied when an
/// entry is built or deserialized.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MudaEntry {
    /// Free-form type label; the set of accepted values is not defined in
    /// this file.
    pub muda_type: String,
    /// Free-form source description.
    pub source: String,
    /// Impact percentage.
    pub impact_pct: f64,
}
/// Metal section stored in `FullProfile::metal`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct MetalMetrics {
    /// GPU timestamp (`_ns` suffix).
    pub gpu_timestamp_ns: u64,
    /// Dispatch configuration as a string; its format is not defined here.
    pub dispatch_config: String,
}
/// Regression section stored in `FullProfile::regression`.
///
/// The statistics are stored as supplied; no test is computed by this type.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct RegressionMetrics {
    /// Regression percentage; the sign convention is not stated in this file.
    pub regression_pct: f64,
    /// p-value.
    pub p_value: f64,
    /// Cohen's d effect size.
    pub effect_size_cohens_d: f64,
    /// Free-form verdict label; accepted values are not defined here.
    pub verdict: String,
}
/// Syscall section stored in `FullProfile::syscall`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SyscallMetrics {
    /// Total syscall count; not checked against the breakdown below.
    pub total_syscalls: u64,
    /// Per-key syscall counts (keys are presumably syscall names — confirm).
    /// Being a `HashMap`, its iteration (and therefore serialized key) order
    /// is unspecified.
    pub syscall_breakdown: std::collections::HashMap<String, u64>,
    /// I/O overhead percentage.
    pub io_overhead_pct: f64,
    /// Minor page-fault count.
    pub page_faults_minor: u64,
    /// Major page-fault count.
    pub page_faults_major: u64,
}
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct HardwareInfo {
pub gpu: Option<String>,
pub gpu_sm: Option<String>,
pub gpu_memory_gb: Option<f64>,
pub gpu_bandwidth_gbps: Option<f64>,
pub gpu_pcie: Option<String>,
pub cpu: Option<String>,
pub cpu_features: Vec<String>,
pub numa_nodes: Option<u32>,
}
/// Kernel description stored in `FullProfile::kernel`.
///
/// When the section is present in the input, `name` and `dimensions` are
/// required (neither carries a serde default); the `Option` fields may be
/// omitted.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct KernelInfo {
    /// Kernel name.
    pub name: String,
    /// Dimensions as a variable-length list; their meaning and order are not
    /// defined in this file.
    pub dimensions: Vec<u32>,
    /// Optional three-component grid.
    pub grid: Option<[u32; 3]>,
    /// Optional three-component block.
    pub block: Option<[u32; 3]>,
    /// Optional shared-memory size in bytes.
    pub shared_memory_bytes: Option<u32>,
    /// Optional registers-per-thread count.
    pub registers_per_thread: Option<u32>,
}
/// Returns the total number of metrics, computed as the sum of 28 hard-coded
/// per-category counts (150 in total).
///
/// NOTE(review): the counts appear to follow the order of the metric sections
/// in `FullProfile` (timing first, syscall last) and mostly equal the number
/// of fields in the matching struct. The `13` sits in the `muda` position and
/// does not match `MudaEntry`'s three fields — confirm what it counts. The
/// table is maintained by hand and is not derived from the struct definitions.
pub fn total_metric_count() -> usize {
    // One entry per category, kept in the same order as the original sum.
    const PER_CATEGORY: [usize; 28] = [
        5, 4, 6, 12, 8, 4, 3, 7, 5, 8, 2, 8, 5, 3, 8, 4, 6, 2, 6, 3, 3, 6, 4, 4, 13, 2, 4, 5,
    ];
    PER_CATEGORY.iter().sum()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Top-level `FullProfile` keys that are not counted as metric categories.
    const NON_CATEGORY_KEYS: [&str; 4] = ["version", "timestamp", "hardware", "kernel"];

    /// The hand-maintained per-category table must still add up to 150.
    #[test]
    fn test_metric_count() {
        assert_eq!(total_metric_count(), 150);
    }

    /// Counts the metric categories actually present on `FullProfile`.
    ///
    /// A default profile is serialized to JSON and its top-level keys are
    /// counted, so the test fails when a section is added or removed without
    /// updating the expected number. No field uses `skip_serializing_if`, so
    /// `None` sections still appear as keys.
    #[test]
    fn test_category_count() {
        let value = serde_json::to_value(FullProfile::default()).unwrap();
        let object = value
            .as_object()
            .expect("FullProfile serializes to a JSON object");
        let categories = object
            .keys()
            .filter(|key| !NON_CATEGORY_KEYS.contains(&key.as_str()))
            .count();
        assert_eq!(categories, 28);
    }

    /// Values set on a profile survive a JSON serialize/deserialize round trip.
    #[test]
    fn test_full_profile_json_roundtrip() {
        let profile = FullProfile {
            version: "2.0".to_string(),
            timestamp: "2026-04-04T12:00:00Z".to_string(),
            timing: TimingMetrics {
                wall_clock_time_us: 23.2,
                samples: 50,
                stddev_us: 0.3,
                ci_95_low_us: 23.0,
                ci_95_high_us: 23.4,
            },
            throughput: ThroughputMetrics {
                tflops: 11.6,
                gflops: 0.0,
                bandwidth_gbps: 78.4,
                arithmetic_intensity: 16.0,
            },
            ..Default::default()
        };
        let json = serde_json::to_string_pretty(&profile).unwrap();
        let parsed: FullProfile = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed.version, "2.0");
        // Floats are compared with a tolerance rather than for exact equality.
        assert!((parsed.timing.wall_clock_time_us - 23.2).abs() < 0.01);
        assert!((parsed.throughput.tflops - 11.6).abs() < 0.01);
    }
}