/// A single profiled component: its aggregate runtime and invocation count.
#[derive(Debug, Clone)]
pub struct ComponentTiming {
    /// Component label (e.g. a kernel or pipeline-stage name).
    pub name: String,
    /// Total time attributed to this component across all calls
    /// (divided by `calls` downstream to get a per-call average).
    pub duration: Duration,
    /// Number of times the component was invoked; may be 0.
    pub calls: u64,
}
/// Accumulates per-component timings over a profiling session.
#[derive(Debug, Default)]
pub struct ProfilingCollector {
    /// Timings recorded so far, in insertion order.
    timings: Vec<ComponentTiming>,
    /// Session start instant; `None` until `start` is called.
    start: Option<Instant>,
}
impl ProfilingCollector {
    /// Creates an empty collector (no timings, session not started).
    pub fn new() -> Self {
        Self::default()
    }

    /// Marks the beginning of a profiling session.
    pub fn start(&mut self) {
        self.start = Some(Instant::now());
    }

    /// Records one component measurement.
    pub fn record(&mut self, name: &str, duration: Duration, calls: u64) {
        let timing = ComponentTiming {
            name: name.to_string(),
            duration,
            calls,
        };
        self.timings.push(timing);
    }

    /// Consumes the collector and returns the components that account for
    /// more than 5% of the total recorded time, annotated via
    /// `explain_component`. Returns an empty vector when nothing was
    /// recorded (avoids a division by zero).
    pub fn into_hotspots(self) -> Vec<ProfilingHotspot> {
        let total: Duration = self.timings.iter().map(|t| t.duration).sum();
        let total_nanos = total.as_nanos() as f64;
        if total_nanos == 0.0 {
            return Vec::new();
        }
        let mut hotspots = Vec::new();
        for timing in self.timings {
            let percentage = (timing.duration.as_nanos() as f64 / total_nanos) * 100.0;
            // Skip anything at or below the 5% significance threshold.
            if percentage <= 5.0 {
                continue;
            }
            // Integer nanosecond division; zero calls maps to a zero average.
            let avg_per_call = match timing.calls {
                0 => Duration::ZERO,
                n => Duration::from_nanos((timing.duration.as_nanos() / u128::from(n)) as u64),
            };
            let (explanation, is_expected) = explain_component(&timing.name, percentage);
            hotspots.push(ProfilingHotspot {
                component: timing.name,
                time: timing.duration,
                percentage,
                call_count: timing.calls,
                avg_per_call,
                explanation,
                is_expected,
            });
        }
        hotspots
    }
}
/// Maps a component name (plus its share of total time) to a human-readable
/// explanation and a flag saying whether that share is expected.
///
/// Returns `(explanation, is_expected)`. Unknown components get an empty
/// explanation unless they exceed 20%, in which case they are flagged for
/// investigation.
fn explain_component(name: &str, percentage: f64) -> (String, bool) {
    match name {
        // Dense matrix work dominates transformer inference by design.
        "Q4K_GEMV" | "MatMul" | "GEMM" | "TensorCore" => (
            format!("Matrix ops at {percentage:.1}% - expected for transformer inference"),
            true,
        ),
        "Attention" | "FlashAttention" | "IncrementalAttention" => (
            format!("Attention at {percentage:.1}% - normal for autoregressive decode"),
            true,
        ),
        // Normalization above 15% suggests missing kernel fusion.
        "RMSNorm" | "LayerNorm" if percentage > 15.0 => (
            "Normalization high - megakernel fusion recommended".to_string(),
            false,
        ),
        "RMSNorm" | "LayerNorm" => ("Normalization within normal range".to_string(), true),
        "KernelLaunch" => (
            "Kernel launch overhead - CUDA graphs recommended (PAR-037)".to_string(),
            false,
        ),
        "MemcpyH2D" | "MemcpyD2H" | "Transfer" => (
            "Memory transfer - persistent buffers recommended (PAR-038)".to_string(),
            false,
        ),
        // KV cache above 20% suggests switching to a cheaper representation.
        "KVCache" | "KV_Cache" if percentage > 20.0 => (
            "KV cache overhead high - FP16/ZRAM recommended".to_string(),
            false,
        ),
        "KVCache" | "KV_Cache" => ("KV cache within normal range".to_string(), true),
        "SwiGLU" | "FFN" => (
            format!("FFN at {percentage:.1}% - expected for transformer"),
            true,
        ),
        "Embedding" => (
            "Embedding lookup - expected at inference start".to_string(),
            true,
        ),
        "Sampling" | "TopK" | "TopP" => (
            "Sampling overhead - expected for token generation".to_string(),
            true,
        ),
        // Anything unrecognized only matters once it dominates the profile.
        _ if percentage > 20.0 => (format!("Unknown at {percentage:.1}% - investigate"), false),
        _ => (String::new(), true),
    }
}
/// Pass/fail results for the PMAT showcase acceptance criteria.
#[derive(Debug, Clone)]
// Each field is an independent pass/fail flag, so a struct of bools is the
// natural shape here despite the clippy lint.
#[allow(clippy::struct_excessive_bools)] pub struct PmatVerification {
    /// Point 41: throughput >= 1.25x llama.cpp.
    pub point_41_pass: bool,
    /// Point 42: throughput >= 60 tok/s.
    pub point_42_pass: bool,
    /// Point 49: coefficient of variation < 5%.
    pub point_49_pass: bool,
    /// Stretch target: throughput >= 2x Ollama (reported as PENDING, not FAIL).
    pub ollama_2x_pass: bool,
    /// True when points 41, 42 and 49 all pass; the 2x Ollama stretch
    /// target is deliberately excluded (see `verify`).
    pub all_pass: bool,
}
impl PmatVerification {
    /// Evaluates the PMAT acceptance criteria against the runner's stats.
    ///
    /// APR stats prefer the GGUF run and fall back to the native run; when
    /// neither exists, throughput defaults to 0 and CV to 1.0 so every
    /// check fails. Missing baseline stats fall back to reference numbers
    /// (200 tok/s llama.cpp, 318 tok/s Ollama). The 2x Ollama stretch
    /// target is tracked but excluded from `all_pass`.
    pub fn verify(runner: &ShowcaseRunner) -> Self {
        // Prefer GGUF stats, falling back to native; compute both APR
        // metrics from the same selection.
        let apr = runner
            .apr_gguf_stats
            .as_ref()
            .or(runner.apr_native_stats.as_ref());
        let apr_tps = apr.map_or(0.0, |s| s.mean_throughput);
        let apr_cv = apr.map_or(1.0, |s| s.cv);
        let llamacpp_tps = runner
            .llamacpp_stats
            .as_ref()
            .map_or(200.0, |s| s.mean_throughput);
        let ollama_tps = runner
            .ollama_stats
            .as_ref()
            .map_or(318.0, |s| s.mean_throughput);

        let point_41_pass = apr_tps >= llamacpp_tps * 1.25;
        let point_42_pass = apr_tps >= 60.0;
        let point_49_pass = apr_cv < 0.05;
        let ollama_2x_pass = apr_tps >= ollama_tps * 2.0;
        Self {
            point_41_pass,
            point_42_pass,
            point_49_pass,
            ollama_2x_pass,
            // The Ollama stretch target is intentionally not gating.
            all_pass: point_41_pass && point_42_pass && point_49_pass,
        }
    }

    /// Renders the verification results as a plain-text report.
    pub fn to_report(&self) -> String {
        use std::fmt::Write;
        const RULE: &str = "─────────────────────────────────────\n";
        let verdict = |pass: bool| if pass { "✓ PASS" } else { "✗ FAIL" };

        let mut out = String::from("PMAT Verification Results:\n");
        out.push_str(RULE);
        let _ = writeln!(
            out,
            "Point 41 (≥1.25x llama.cpp): {}",
            verdict(self.point_41_pass)
        );
        let _ = writeln!(out, "Point 42 (≥60 tok/s): {}", verdict(self.point_42_pass));
        let _ = writeln!(out, "Point 49 (CV <5%): {}", verdict(self.point_49_pass));
        // The stretch target reports PENDING rather than FAIL.
        let _ = writeln!(
            out,
            "2x Ollama Target: {}",
            if self.ollama_2x_pass {
                "✓ PASS"
            } else {
                "○ PENDING"
            }
        );
        out.push_str(RULE);
        let _ = writeln!(
            out,
            "Overall: {}",
            if self.all_pass {
                "✓ ALL PASS"
            } else {
                "✗ NEEDS WORK"
            }
        );
        out
    }
}
#[cfg(feature = "showcase-profile")]
pub mod profiler {
    use super::{explain_component, Duration, ProfilingHotspot};
    use renacer::time_attribution::Hotspot;

    /// Configuration for the renacer-backed profiler.
    #[derive(Debug, Clone)]
    pub struct RenacerProfilerConfig {
        pub threshold_us: u64,
        pub trace_all: bool,
        pub device_id: u32,
    }

    impl Default for RenacerProfilerConfig {
        /// 100 µs threshold, threshold-filtered tracing, device 0.
        fn default() -> Self {
            Self {
                threshold_us: 100,
                trace_all: false,
                device_id: 0,
            }
        }
    }

    /// Converts renacer hotspots into showcase `ProfilingHotspot`s,
    /// attaching an explanation via `explain_component`.
    ///
    /// Renacer clusters carry no per-call counters, so `call_count` and
    /// `avg_per_call` are filled with zero sentinels.
    pub fn convert_hotspots(renacer_hotspots: &[Hotspot]) -> Vec<ProfilingHotspot> {
        let mut converted = Vec::with_capacity(renacer_hotspots.len());
        for spot in renacer_hotspots {
            let (explanation, is_expected) = explain_component(&spot.cluster, spot.percentage);
            converted.push(ProfilingHotspot {
                component: spot.cluster.clone(),
                time: spot.time,
                percentage: spot.percentage,
                call_count: 0,
                avg_per_call: Duration::ZERO,
                explanation,
                is_expected,
            });
        }
        converted
    }

    pub use renacer::cuda_tracer::CudaTracerConfig;
    pub use renacer::time_attribution::{identify_hotspots, Hotspot as RenacerHotspot};
}
#[cfg(not(feature = "showcase-profile"))]
pub mod profiler {
    /// Stub profiler config used when the `showcase-profile` feature is off.
    ///
    /// Mirrors the feature-gated `RenacerProfilerConfig` field-for-field so
    /// code constructing a config compiles identically either way.
    #[derive(Debug, Clone)]
    pub struct RenacerProfilerConfig {
        pub threshold_us: u64,
        pub trace_all: bool,
        pub device_id: u32,
    }

    impl Default for RenacerProfilerConfig {
        /// Matches the feature-enabled config's defaults (100 µs threshold,
        /// tracing off, device 0). Previously this derived `Default`, which
        /// yielded `threshold_us: 0` and made the defaults silently differ
        /// depending on the feature flag.
        fn default() -> Self {
            Self {
                threshold_us: 100,
                trace_all: false,
                device_id: 0,
            }
        }
    }
}