use crate::error::CliError;
use crate::output;
use colored::Colorize;
use std::fmt::Write;
use std::path::Path;
use std::time::Instant;
#[cfg(feature = "inference")]
use realizar::brick::BrickProfiler;
#[cfg(feature = "inference")]
use realizar::gguf::{MappedGGUFModel, OwnedQuantizedModel, QuantizedGenerateConfig};
/// Report output format selected on the command line.
#[derive(Debug, Clone, Copy, Default)]
pub(crate) enum OutputFormat {
    /// Colored terminal text (the default).
    #[default]
    Human,
    /// Machine-readable JSON.
    Json,
    /// Flamegraph output (SVG).
    Flamegraph,
}

impl std::str::FromStr for OutputFormat {
    type Err = String;

    /// Parses a case-insensitive format name. Accepts the aliases
    /// "text" (→ Human) and "svg" (→ Flamegraph).
    fn from_str(raw: &str) -> Result<Self, Self::Err> {
        let key = raw.to_lowercase();
        if key == "human" || key == "text" {
            Ok(Self::Human)
        } else if key == "json" {
            Ok(Self::Json)
        } else if key == "flamegraph" || key == "svg" {
            Ok(Self::Flamegraph)
        } else {
            Err(format!("Unknown format: {raw}"))
        }
    }
}
/// Which operation category the profiler should focus on.
#[derive(Debug, Clone, Copy, Default)]
pub(crate) enum ProfileFocus {
    /// Profile everything (the default).
    #[default]
    All,
    Attention,
    Mlp,
    Matmul,
    Embedding,
}

impl std::str::FromStr for ProfileFocus {
    type Err = String;

    /// Parses a case-insensitive focus name. Each focus also has a short
    /// alias: "attn", "ffn", "gemm", "embed".
    fn from_str(raw: &str) -> Result<Self, Self::Err> {
        let key = raw.to_lowercase();
        // Alias table: every accepted spelling mapped to its variant.
        let aliases: [(&str, Self); 9] = [
            ("all", Self::All),
            ("attention", Self::Attention),
            ("attn", Self::Attention),
            ("mlp", Self::Mlp),
            ("ffn", Self::Mlp),
            ("matmul", Self::Matmul),
            ("gemm", Self::Matmul),
            ("embedding", Self::Embedding),
            ("embed", Self::Embedding),
        ];
        aliases
            .iter()
            .find(|(alias, _)| *alias == key.as_str())
            .map(|&(_, focus)| focus)
            .ok_or_else(|| format!("Unknown focus: {raw}"))
    }
}
/// One profiled operation, aggregated across all recorded calls.
#[derive(Debug, Clone)]
#[allow(dead_code)]
struct Hotspot {
    // Operation name as reported by the profiler.
    name: String,
    // Total time spent in this operation, microseconds.
    time_us: f64,
    // Share of total time (percent) and number of recorded calls.
    percent: f64, count: usize,
    // Per-call timing statistics, microseconds.
    avg_us: f64,
    min_us: f64,
    max_us: f64,
    // Optional analysis metadata — populated only when available
    // (presumably by the roofline analysis; confirm against the producers).
    bottleneck: Option<String>,
    efficiency_pct: Option<f64>,
    category: Option<String>,
    bandwidth_gbs: Option<f64>,
    data_bytes_per_call: Option<u64>,
}
/// Thresholds for CI gating; a `None` field disables that check.
#[derive(Debug, Clone, Default)]
pub struct CiAssertions {
    /// Minimum acceptable throughput, tokens/second.
    pub min_throughput: Option<f64>,
    /// Maximum acceptable p99 latency, milliseconds.
    pub max_p99_ms: Option<f64>,
    /// Maximum acceptable p50 latency, milliseconds.
    pub max_p50_ms: Option<f64>,
    /// Maximum memory usage in MB — not currently enforced anywhere in this
    /// file (hence the dead_code allowance).
    #[allow(dead_code)]
    pub max_memory_mb: Option<f64>,
}
/// Outcome of a single CI assertion check.
#[derive(Debug, Clone)]
pub struct AssertionResult {
    /// Assertion identifier, e.g. "throughput" or "latency_p99".
    pub name: String,
    /// Human-readable threshold, e.g. ">= 10.0 tok/s".
    pub expected: String,
    /// Human-readable measured value.
    pub actual: String,
    /// Whether the measured value satisfied the threshold.
    pub passed: bool,
}
/// Aggregated CI verdict: headline metrics plus per-assertion outcomes.
#[derive(Debug, Clone)]
pub struct CiProfileReport {
    pub model_path: String,
    /// True only if every assertion passed.
    pub passed: bool,
    pub throughput_tok_s: f64,
    /// Latency percentiles in ms. NOTE(review): `from_results` currently
    /// fills both p50 and p99 from the same total-latency figure, so these
    /// two fields hold identical values.
    pub latency_p50_ms: f64,
    pub latency_p99_ms: f64,
    pub assertions: Vec<AssertionResult>,
}
impl CiProfileReport {
    /// Builds a CI pass/fail report from measured profile results and the
    /// user-supplied assertion thresholds. Only thresholds that are `Some`
    /// produce an assertion entry.
    ///
    /// NOTE(review): both the p50 and p99 checks (and the report's p50/p99
    /// fields) are derived from the single `total_inference_us` figure, even
    /// though `RealProfileResults` carries separate percentile fields —
    /// confirm whether those should be used instead.
    pub(crate) fn from_results(results: &RealProfileResults, assertions: &CiAssertions) -> Self {
        let mut assertion_results = Vec::new();
        let mut all_passed = true;
        // Total inference time converted from microseconds to milliseconds.
        let latency_ms = results.total_inference_us / 1000.0;

        // Records one assertion outcome and folds it into the overall verdict.
        let mut record = |name: &str, passed: bool, expected: String, actual: String| {
            all_passed &= passed;
            assertion_results.push(AssertionResult {
                name: name.to_string(),
                expected,
                actual,
                passed,
            });
        };

        if let Some(min_tps) = assertions.min_throughput {
            record(
                "throughput",
                results.throughput_tok_s >= min_tps,
                format!(">= {min_tps:.1} tok/s"),
                format!("{:.1} tok/s", results.throughput_tok_s),
            );
        }
        if let Some(max_p99) = assertions.max_p99_ms {
            record(
                "latency_p99",
                latency_ms <= max_p99,
                format!("<= {max_p99:.1} ms"),
                format!("{latency_ms:.2} ms"),
            );
        }
        if let Some(max_p50) = assertions.max_p50_ms {
            record(
                "latency_p50",
                latency_ms <= max_p50,
                format!("<= {max_p50:.1} ms"),
                format!("{latency_ms:.2} ms"),
            );
        }

        CiProfileReport {
            model_path: results.model_path.clone(),
            passed: all_passed,
            throughput_tok_s: results.throughput_tok_s,
            latency_p50_ms: latency_ms,
            latency_p99_ms: latency_ms,
            assertions: assertion_results,
        }
    }

    /// Prints a human-readable, colored report to stdout.
    pub fn print_human(&self) {
        println!();
        println!("{}", "CI PROFILE REPORT (PMAT-192)".white().bold());
        println!("{}", "═".repeat(60));
        println!();
        println!(" Model: {}", self.model_path.cyan());
        println!(" Throughput: {:.1} tok/s", self.throughput_tok_s);
        println!(" Latency p50: {:.2} ms", self.latency_p50_ms);
        println!(" Latency p99: {:.2} ms", self.latency_p99_ms);
        println!();
        if !self.assertions.is_empty() {
            println!("{}", "ASSERTIONS".white().bold());
            println!("{}", "─".repeat(60));
            for assertion in &self.assertions {
                let status = if assertion.passed {
                    "✅ PASS".green()
                } else {
                    "❌ FAIL".red()
                };
                println!(
                    " {} {}: {} (expected {})",
                    status,
                    assertion.name.cyan(),
                    assertion.actual,
                    assertion.expected
                );
            }
            println!();
        }
        if self.passed {
            println!("{}", "✅ ALL ASSERTIONS PASSED".green().bold());
        } else {
            println!("{}", "❌ ASSERTIONS FAILED".red().bold());
        }
        println!();
    }

    /// Escapes a string for embedding inside a JSON string literal
    /// (RFC 8259: quote, backslash, and control characters).
    fn json_escape(raw: &str) -> String {
        let mut out = String::with_capacity(raw.len());
        for c in raw.chars() {
            match c {
                '"' => out.push_str("\\\""),
                '\\' => out.push_str("\\\\"),
                '\n' => out.push_str("\\n"),
                '\r' => out.push_str("\\r"),
                '\t' => out.push_str("\\t"),
                c if (c as u32) < 0x20 => {
                    write!(out, "\\u{:04x}", c as u32).expect("write to String is infallible");
                }
                c => out.push(c),
            }
        }
        out
    }

    /// Prints the report as JSON to stdout.
    ///
    /// Fix: string fields are now escaped via `json_escape`, so model paths
    /// containing backslashes (Windows) or quotes no longer produce invalid
    /// JSON.
    pub fn print_json(&self) {
        let mut json = String::from("{\n");
        writeln!(
            json,
            " \"model\": \"{}\",",
            Self::json_escape(&self.model_path)
        )
        .expect("write to String is infallible");
        writeln!(json, " \"passed\": {},", self.passed).expect("write to String is infallible");
        json.push_str(" \"metrics\": {\n");
        writeln!(
            json,
            " \"throughput_tok_s\": {:.2},",
            self.throughput_tok_s
        )
        .expect("write to String is infallible");
        writeln!(json, " \"latency_p50_ms\": {:.2},", self.latency_p50_ms)
            .expect("write to String is infallible");
        writeln!(json, " \"latency_p99_ms\": {:.2}", self.latency_p99_ms)
            .expect("write to String is infallible");
        json.push_str(" },\n");
        json.push_str(" \"assertions\": [\n");
        // Index of the last entry: it must not carry a trailing comma.
        let last = self.assertions.len().saturating_sub(1);
        for (i, assertion) in self.assertions.iter().enumerate() {
            json.push_str(" {\n");
            writeln!(
                json,
                " \"name\": \"{}\",",
                Self::json_escape(&assertion.name)
            )
            .expect("write to String is infallible");
            writeln!(
                json,
                " \"expected\": \"{}\",",
                Self::json_escape(&assertion.expected)
            )
            .expect("write to String is infallible");
            writeln!(
                json,
                " \"actual\": \"{}\",",
                Self::json_escape(&assertion.actual)
            )
            .expect("write to String is infallible");
            writeln!(json, " \"passed\": {}", assertion.passed)
                .expect("write to String is infallible");
            json.push_str(if i == last { " }\n" } else { " },\n" });
        }
        json.push_str(" ]\n");
        json.push_str("}\n");
        println!("{json}");
    }
}
/// Roofline-model analysis comparing achieved performance against hardware
/// peaks.
#[derive(Debug, Clone, Default)]
pub(crate) struct RooflineAnalysis {
    // Hardware peak compute (units not shown here — presumably GFLOPS to
    // match `achieved_gflops`; confirm against the producer).
    pub peak_compute: f64,
    // Hardware peak memory bandwidth, GB/s.
    pub peak_bandwidth_gbps: f64,
    // Measured compute throughput, GFLOPS.
    pub achieved_gflops: f64,
    // Measured memory bandwidth, GB/s.
    pub achieved_bandwidth_gbps: f64,
    // Achieved / peak ratios, as percentages.
    pub compute_efficiency_pct: f64,
    pub memory_efficiency_pct: f64,
    // Arithmetic intensity (presumably FLOPs per byte — confirm) and the
    // ridge-point threshold separating memory-bound from compute-bound.
    pub arithmetic_intensity: f64,
    pub ai_threshold: f64,
    // Human-readable bottleneck classification.
    pub bottleneck: String,
    #[allow(dead_code)]
    pub backend: String,
    pub hardware_model: String,
}
/// Share of total time spent in each operation category, in percent.
#[derive(Debug, Clone, Default)]
#[allow(clippy::struct_field_names)]
pub(crate) struct CategorySummary {
    pub attention_pct: f64,
    pub ffn_pct: f64,
    pub norm_pct: f64,
    pub tokenize_pct: f64,
    pub training_pct: f64,
    pub serving_pct: f64,
    // Time not attributed to any category above.
    pub other_pct: f64,
}
/// Letter grade for hardware-efficiency, from A (best) to F (worst).
#[derive(Debug, Clone, Copy)]
pub(crate) enum PerfGrade {
    A,
    B,
    C,
    D,
    F,
}

impl PerfGrade {
    /// Maps an efficiency percentage onto a grade. Thresholds (inclusive):
    /// A >= 50, B >= 30, C >= 15, D >= 5, else F.
    fn from_efficiency(pct: f64) -> Self {
        match pct {
            p if p >= 50.0 => Self::A,
            p if p >= 30.0 => Self::B,
            p if p >= 15.0 => Self::C,
            p if p >= 5.0 => Self::D,
            _ => Self::F,
        }
    }

    /// Single lookup table pairing each grade with its letter and blurb.
    fn info(self) -> (&'static str, &'static str) {
        match self {
            Self::A => ("A", "Excellent — near hardware peak"),
            Self::B => ("B", "Good — reasonable utilization"),
            Self::C => ("C", "Fair — room for improvement"),
            Self::D => ("D", "Poor — significant optimization needed"),
            Self::F => (
                "F",
                "Critical — likely using wrong backend or naive implementation",
            ),
        }
    }

    /// One-letter label for display.
    fn label(self) -> &'static str {
        self.info().0
    }

    /// Longer human-readable description of the grade.
    fn description(self) -> &'static str {
        self.info().1
    }
}

impl std::fmt::Display for PerfGrade {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.label())
    }
}
/// Raw measurements collected from a profiling run. Consumed by the report
/// builders in this file and by the `include!`d submodules.
#[derive(Debug, Clone, Default)]
pub(crate) struct RealProfileResults {
    // Path of the profiled model file.
    model_path: String,
    // Model metadata.
    architecture: String,
    num_layers: usize,
    vocab_size: usize,
    hidden_dim: usize,
    // Benchmark configuration: passes before / during measurement.
    warmup_passes: usize,
    measure_passes: usize,
    // Aggregate timing (microseconds) and derived throughput.
    total_inference_us: f64,
    throughput_tok_s: f64,
    tokens_per_pass: usize,
    // Per-operation and per-layer breakdowns.
    hotspots: Vec<Hotspot>,
    per_layer_us: Vec<f64>,
    // Presumably true when numbers come from actual measurement rather than
    // estimates — confirm against the producers; inferred from the name.
    is_real_data: bool,
    // Optional hardware-efficiency analyses.
    roofline: Option<RooflineAnalysis>,
    category_summary: Option<CategorySummary>,
    backend: String,
    // Latency percentiles and extremes, milliseconds.
    latency_p50_ms: f64,
    latency_p95_ms: f64,
    latency_p99_ms: f64,
    latency_min_ms: f64,
    latency_max_ms: f64,
    // Per-phase throughput, tokens/second.
    prefill_tok_s: f64,
    decode_tok_s: f64,
    total_tokens_generated: usize,
    // Kernel-launch overhead as % of total and in microseconds.
    kernel_launch_overhead_pct: f64,
    kernel_launch_overhead_us: f64,
}
/// Baseline throughput figures from an Ollama run, for comparison against
/// this profiler's numbers (presumably populated by the included
/// profile_ollama/comparison modules — confirm there).
#[derive(Debug, Clone)]
struct OllamaBaseline {
    // Decode-phase throughput, tokens/second.
    decode_tok_s: f64,
    // Prefill-phase throughput, tokens/second.
    prefill_tok_s: f64,
    model_name: String,
}
/// Returns the p-th percentile (0–100) of an ascending-sorted slice using
/// linear interpolation between the two bracketing samples.
///
/// Empty input yields 0.0; a single sample is returned as-is; a rank at or
/// past the end clamps to the last sample.
fn percentile(sorted: &[f64], p: f64) -> f64 {
    match sorted {
        [] => 0.0,
        [only] => *only,
        _ => {
            // Fractional rank into the slice for this percentile.
            let rank = (p / 100.0) * (sorted.len() - 1) as f64;
            let lo = rank.floor() as usize;
            let hi = rank.ceil() as usize;
            if hi >= sorted.len() {
                return sorted[sorted.len() - 1];
            }
            // Blend the two neighbours by the fractional part of the rank.
            let weight = rank - lo as f64;
            sorted[lo] * (1.0 - weight) + sorted[hi] * weight
        }
    }
}
// The profile command is split across several files that are textually
// included here (rather than declared as `mod`s) so they share this
// module's private items, e.g. `RealProfileResults` and `Hotspot`.
include!("diff_benchmark_report.rs");
include!("profile_pct_change_classify.rs");
include!("kernel.rs");
include!("profile_ollama.rs");
include!("profile_safetensors.rs");
include!("profile_print_hotspot.rs");
include!("comparison.rs");
include!("profile_09.rs");