#[cfg(feature = "inference")]
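/// PAR-103 batch path: pre-caches weights on the GPU once, then pushes
/// `config.concurrent` copies of the last prompt token (or 0 for an empty
/// prompt) through `forward_cuda` on every iteration. Returns the total
/// number of tokens processed and the per-iteration latencies in microseconds.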
fn measure_batch_throughput(
config: &CbtopConfig,
cuda_model: &mut realizar::gguf::OwnedQuantizedModelCuda,
prompt_tokens: &[u32],
) -> Result<(usize, Vec<f64>)> {
eprintln!("cbtop: PAR-103 Pre-caching weights for batch mode...");
let cache_bytes = cuda_model
.pre_cache_weights_for_batch()
.map_err(|e| CliError::ValidationFailed(format!("Failed to pre-cache weights: {e}")))?;
eprintln!(
"cbtop: PAR-103 Cached {:.1} MB of weights",
cache_bytes as f64 / 1024.0 / 1024.0
);
eprintln!(
"cbtop: PAR-103 Batch mode - {} concurrent tokens per forward",
config.concurrent
);
let batch_tokens: Vec<u32> = (0..config.concurrent)
.map(|_| prompt_tokens.last().copied().unwrap_or(0))
.collect();
let mut total_tokens = 0usize;
let mut latencies_us = Vec::with_capacity(config.iterations);
for i in 0..config.iterations {
let iter_start = Instant::now();
let result = cuda_model.forward_cuda(&batch_tokens);
match result {
Ok(_logits) => {
total_tokens += config.concurrent;
latencies_us.push(iter_start.elapsed().as_micros() as f64);
}
Err(e) => {
eprintln!("\ncbtop: Batch forward error: {e}");
return Err(CliError::ValidationFailed(format!(
"Batch forward failed: {e}"
)));
}
}
eprint!("\r Iteration {}/{}", i + 1, config.iterations);
}
Ok((total_tokens, latencies_us))
}
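/// Standard (non-batched) generation loop. Each iteration runs either
/// speculative decoding (against the explicit draft model when one is loaded,
/// otherwise the model's own speculative path) or plain GPU-resident
/// generation. Returns the total tokens generated beyond the prompt and the
/// per-iteration latencies in microseconds.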
#[cfg(feature = "inference")]
fn measure_standard_throughput(
config: &CbtopConfig,
cuda_model: &mut realizar::gguf::OwnedQuantizedModelCuda,
draft_cuda_model: &mut Option<realizar::gguf::OwnedQuantizedModelCuda>,
prompt_tokens: &[u32],
gen_config: &realizar::gguf::QuantizedGenerateConfig,
) -> Result<(usize, Vec<f64>)> {
let mut total_tokens = 0usize;
let mut latencies_us = Vec::with_capacity(config.iterations);
for i in 0..config.iterations {
let iter_start = Instant::now();
let result = if config.speculative {
if let Some(ref mut draft) = draft_cuda_model {
cuda_model.generate_speculative_with_draft(
draft,
prompt_tokens,
gen_config,
config.speculation_k,
)
} else {
cuda_model.generate_speculative_cuda(
prompt_tokens,
gen_config,
config.speculation_k,
)
}
} else {
cuda_model.generate_gpu_resident(prompt_tokens, gen_config)
};
match result {
Ok(output) => {
let tokens_generated = output.len().saturating_sub(prompt_tokens.len());
total_tokens += tokens_generated;
latencies_us.push(iter_start.elapsed().as_micros() as f64);
}
Err(e) => {
eprintln!("\ncbtop: Generation error: {e}");
return Err(CliError::ValidationFailed(format!(
"Generation failed: {e}"
)));
}
}
eprint!("\r Iteration {}/{}", i + 1, config.iterations);
}
Ok((total_tokens, latencies_us))
}
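/// Prints per-brick timing collected by the model's `BrickProfiler`, sorted by
/// total time spent, or a hint when no per-brick samples were recorded.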
#[cfg(feature = "inference")]
fn print_profiler_brick_stats(cuda_model: &realizar::gguf::OwnedQuantizedModelCuda) {
let profiler = cuda_model.profiler();
let mut all: Vec<_> = profiler.all_brick_stats().collect();
if all.is_empty() {
eprintln!(" No per-brick data collected (profiling may need per-brick sync points)");
} else {
eprintln!("Per-Brick Timing (REAL via BrickProfiler):");
all.sort_by(|a, b| b.total_ns.cmp(&a.total_ns));
for stats in all {
eprintln!(
" {:20} {:8.2}µs avg, {:8} samples, {:.1} tok/s",
stats.name,
stats.avg_us(),
stats.count,
stats.tokens_per_sec()
);
}
}
}
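/// Checks the run against renacer's `BrickEscalationThresholds`: if the
/// coefficient of variation is too high or efficiency too low, enables a local
/// `BrickTracer` and reports why the run is being escalated.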
#[cfg(all(feature = "inference", feature = "visualization"))]
fn check_renacer_escalation(tokens_per_sec: f64, cv_percent: f64) {
use renacer::brick_tracer::{BrickEscalationThresholds, BrickTracer};
let thresholds = BrickEscalationThresholds::default();
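    // Efficiency is expressed relative to a hardcoded 976 tok/s reference throughput.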
let efficiency = tokens_per_sec / 976.0 * 100.0;
if cv_percent > thresholds.cv_percent || efficiency < thresholds.efficiency_percent {
eprintln!();
eprintln!(
"cbtop: Anomaly detected (CV: {:.1}%, efficiency: {:.1}%) - escalating to renacer",
cv_percent, efficiency
);
eprintln!(
" Threshold: CV > {:.1}% or efficiency < {:.1}%",
thresholds.cv_percent, thresholds.efficiency_percent
);
let _tracer = BrickTracer::new_local();
let reason =
if cv_percent > thresholds.cv_percent && efficiency < thresholds.efficiency_percent {
"cv_and_efficiency"
} else if cv_percent > thresholds.cv_percent {
"cv_exceeded"
} else {
"efficiency_low"
};
eprintln!(" BrickTracer: Enabled for syscall breakdown");
eprintln!(" Escalation reason: {reason}");
eprintln!();
}
}
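/// Loads the optional draft model used for speculative decoding (PAR-099):
/// maps the GGUF file, builds the quantized model, and initialises it on the
/// GPU. Returns `Ok(None)` when no draft model path is configured.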
#[cfg(feature = "inference")]
fn load_draft_model(
config: &CbtopConfig,
) -> Result<Option<realizar::gguf::OwnedQuantizedModelCuda>> {
use realizar::gguf::{MappedGGUFModel, OwnedQuantizedModel, OwnedQuantizedModelCuda};
let Some(ref draft_path) = config.draft_model_path else {
return Ok(None);
};
eprintln!("cbtop: Loading draft model (PAR-099)...");
let draft_load_start = Instant::now();
let draft_mapped = MappedGGUFModel::from_path(draft_path)
.map_err(|e| CliError::ValidationFailed(format!("Failed to map draft model: {e}")))?;
let draft_model = OwnedQuantizedModel::from_mapped(&draft_mapped)
.map_err(|e| CliError::ValidationFailed(format!("Failed to create draft model: {e}")))?;
let draft_cuda = OwnedQuantizedModelCuda::new(draft_model, 0)
.map_err(|e| CliError::ValidationFailed(format!("Failed to init draft CUDA: {e}")))?;
eprintln!(
"cbtop: Draft model loaded in {:.2}s",
draft_load_start.elapsed().as_secs_f32()
);
Ok(Some(draft_cuda))
}
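/// Builds a `BrickScore` for a brick that is not benchmarked directly: its
/// timing is estimated as `measured_per_layer_us * fraction`, where `fraction`
/// is the brick's budget divided by the total per-layer budget. Logs the
/// derived timing alongside the budget.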
fn derived_brick_score(
name: &str,
budget: f64,
measured_per_layer_us: f64,
fraction: f64,
) -> BrickScore {
let derived_us = measured_per_layer_us * fraction;
let score = compute_brick_score(derived_us, budget);
eprintln!(" {name}: {derived_us:.2}µs (budget: {budget}µs)");
BrickScore {
name: name.to_string(),
score,
grade: score_to_grade(score),
budget_us: budget,
actual_us: derived_us,
gap_factor: derived_us / budget,
}
}
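/// Benchmarks `RmsNormBrick` directly with a unit-filled input of length
/// `hidden_dim` and scores the mean latency against the 1.5 µs budget.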
#[cfg(feature = "inference")]
fn bench_rmsnorm_brick(
hidden_dim: usize,
bench_config: &realizar::brick::BenchmarkConfig,
label: &str,
) -> BrickScore {
use realizar::brick::{benchmark_brick, RmsNormBrick};
let brick = RmsNormBrick::new(vec![1.0; hidden_dim], 1e-5);
let input: Vec<f32> = vec![1.0; hidden_dim];
let report = benchmark_brick(
&brick,
|| {
let start = Instant::now();
let _ = brick.run(&input);
start.elapsed().as_nanos() as f64 / 1000.0
},
bench_config,
);
let score = compute_brick_score(report.mean_us, 1.5);
eprintln!(" {label}: {:.2}µs (budget: 1.5µs)", report.mean_us);
BrickScore {
name: "RmsNorm".to_string(),
score,
grade: score_to_grade(score),
budget_us: 1.5,
actual_us: report.mean_us,
gap_factor: report.mean_us / 1.5,
}
}
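/// Produces per-brick scores for one transformer layer. `RmsNorm` is
/// benchmarked directly; the remaining bricks (marked `*`) are derived by
/// splitting the measured per-layer time according to each brick's share of
/// the 35.7 µs per-layer budget (1.5 + 6.0 + 1.0 + 10.0 + 3.5 + 1.5 + 12.2).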
#[cfg(feature = "inference")]
#[allow(clippy::too_many_arguments)]
fn benchmark_bricks(
config: &CbtopConfig,
hidden_dim: usize,
num_heads: usize,
num_kv_heads: usize,
head_dim: usize,
measured_per_layer_us: f64,
tokens_per_sec: f64,
num_layers: usize,
) -> Vec<BrickScore> {
use realizar::brick::{BenchmarkConfig, QkvBrick};
eprintln!("cbtop: Brick timing estimates (* = derived from throughput)...");
let bench_config = BenchmarkConfig {
warmup: config.warmup.min(10),
samples: config.iterations.min(100),
max_cv: 0.05,
};
let mut reports: Vec<BrickScore> = Vec::new();
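    // Per-brick budgets below sum to 35.7 µs per layer; each derived brick is
    // assigned budget / 35.7 of the measured per-layer time.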
reports.push(bench_rmsnorm_brick(hidden_dim, &bench_config, "RmsNorm"));
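    // QkvBrick is constructed here but never benchmarked directly; its timing
    // is derived from the measured per-layer time below.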
let _brick = QkvBrick::new(
hidden_dim,
hidden_dim,
num_heads * head_dim,
num_kv_heads * head_dim,
);
reports.push(derived_brick_score(
"QkvBrick*",
6.0,
measured_per_layer_us,
6.0 / 35.7,
));
reports.push(derived_brick_score(
"RoPE*",
1.0,
measured_per_layer_us,
1.0 / 35.7,
));
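    // Attention is derived from end-to-end throughput (tok/s -> per-layer µs)
    // rather than from the `measured_per_layer_us` argument used elsewhere.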
let measured_layer_us = if tokens_per_sec > 0.0 && num_layers > 0 {
1_000_000.0 / tokens_per_sec / num_layers as f64
} else {
0.0
};
let attn_us = measured_layer_us * (10.0 / 35.7);
let score = compute_brick_score(attn_us, 10.0);
reports.push(BrickScore {
name: "Attention*".to_string(),
score,
grade: score_to_grade(score),
budget_us: 10.0,
actual_us: attn_us,
gap_factor: attn_us / 10.0,
});
eprintln!(" Attention*: {attn_us:.2}µs (budget: 10.0µs) [* = derived from total throughput]");
reports.push(derived_brick_score(
"OProj*",
3.5,
measured_per_layer_us,
3.5 / 35.7,
));
reports.push(bench_rmsnorm_brick(
hidden_dim,
&bench_config,
"RmsNorm (2)",
));
reports.push(derived_brick_score(
"FfnBrick*",
12.2,
measured_per_layer_us,
12.2 / 35.7,
));
eprintln!();
reports
}
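/// Assembles the `HeadlessReport` (latency percentiles, brick scores, PMAT
/// summary, falsification counts), emits it as JSON (to `config.output` or
/// stdout) or as text, and returns an error when `config.ci` is set and the
/// CI thresholds are not met.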
#[cfg(feature = "inference")]
#[allow(clippy::too_many_arguments)]
fn build_and_output_report(
config: &CbtopConfig,
model_name: &str,
gpu_name: &str,
tokens_per_sec: f64,
cv_percent: f64,
latencies_us: &[f64],
brick_reports: Vec<BrickScore>,
) -> Result<()> {
let mut sorted = latencies_us.to_vec();
sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
if sorted.is_empty() {
return Ok(());
}
let p50 = sorted[sorted.len() / 2];
let p99 = sorted[(sorted.len() as f64 * 0.99).min((sorted.len() - 1) as f64) as usize];
let n_bricks = brick_reports.len().max(1);
let score_sum: f64 = brick_reports.iter().map(|b| b.score as f64).sum();
let pmat_brick_score = (score_sum / n_bricks as f64) as u32;
let all_pass = brick_reports.iter().all(|b| b.gap_factor <= 1.0 + 1e-9);
let status = if all_pass { "PASS" } else { "FAIL" };
let ci_result = if all_pass { "green" } else { "red" };
let brick_passed = brick_reports.iter().filter(|b| b.gap_factor <= 1.0 + 1e-9).count() as u32;
let brick_failed = (n_bricks as u32).saturating_sub(brick_passed);
let report = HeadlessReport {
model: model_name.to_string(),
timestamp: chrono_timestamp(),
hardware: HardwareInfo {
gpu: gpu_name.to_string(),
cpu: get_cpu_info(),
memory_gb: get_memory_gb(),
},
throughput: ThroughputMetrics {
tokens_per_sec,
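            // The p50 iteration latency, converted from µs to ms, is reported as TTFT.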
ttft_ms: p50 / 1000.0,
cv_percent,
p50_us: p50,
p99_us: p99,
},
brick_scores: brick_reports,
pmat_scores: PmatScores {
rust_project_score: 0.0,
tdg_score: 0.0,
cuda_tdg_score: 0.0,
brick_score: pmat_brick_score,
grade: score_to_grade(pmat_brick_score),
},
falsification: FalsificationSummary {
total_points: n_bricks as u32,
passed: brick_passed,
failed: brick_failed,
blocked: 0,
},
status: status.to_string(),
ci_result: ci_result.to_string(),
};
let ci_passed = check_ci_thresholds(&report, config);
if config.json {
let json_output = format_report_as_json(&report);
if let Some(ref path) = config.output {
std::fs::write(path, &json_output).map_err(|e| {
CliError::ValidationFailed(format!("Failed to write output file: {e}"))
})?;
eprintln!("cbtop: Results written to {}", path.display());
} else {
println!("{json_output}");
}
} else {
print_report_text(&report);
}
if config.ci && !ci_passed {
eprintln!("cbtop: CI thresholds not met!");
return Err(CliError::ValidationFailed(
"CI thresholds not met".to_string(),
));
}
Ok(())
}
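/// Maps the ratio `actual_us / budget_us` to a 0-100 score: at or under budget
/// scores 100, up to 20% over budget the score falls linearly to 90, and past
/// that it is `100 - (gap - 1) * 100`, floored at 0.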
fn compute_brick_score(actual_us: f64, budget_us: f64) -> u32 {
let gap = actual_us / budget_us;
if gap <= 1.0 + 1e-9 {
100
} else if gap <= 1.2 {
#[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
{ (100.0 - (gap - 1.0) * 50.0) as u32 }
} else {
#[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
{ (100.0 - (gap - 1.0) * 100.0).max(0.0) as u32 }
}
}
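/// Converts a 0-100 brick score to a letter grade: A >= 90, B >= 80, C >= 70,
/// D >= 60, F otherwise.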
fn score_to_grade(score: u32) -> String {
match score {
90..=100 => "A",
80..=89 => "B",
70..=79 => "C",
60..=69 => "D",
_ => "F",
}
.to_string()
}
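/// Formats the current system time as an ISO-8601 UTC timestamp without a
/// date/time dependency; returns "unknown" if the clock reads before the Unix
/// epoch.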
fn chrono_timestamp() -> String {
    use std::time::{SystemTime, UNIX_EPOCH};
    SystemTime::now().duration_since(UNIX_EPOCH).map_or_else(
        |_| "unknown".to_string(),
        |d| {
            let secs = d.as_secs();
            // Convert days since the Unix epoch to a Gregorian civil date
            // (Howard Hinnant's civil_from_days; secs is non-negative here),
            // instead of emitting a hardcoded date.
            let z = secs / 86_400 + 719_468;
            let era = z / 146_097;
            let doe = z - era * 146_097;
            let yoe = (doe - doe / 1_460 + doe / 36_524 - doe / 146_096) / 365;
            let doy = doe - (365 * yoe + yoe / 4 - yoe / 100);
            let mp = (5 * doy + 2) / 153;
            let day = doy - (153 * mp + 2) / 5 + 1;
            let month = if mp < 10 { mp + 3 } else { mp - 9 };
            let year = yoe + era * 400 + u64::from(month <= 2);
            format!(
                "{year:04}-{month:02}-{day:02}T{:02}:{:02}:{:02}Z",
                (secs / 3_600) % 24,
                (secs / 60) % 60,
                secs % 60
            )
        },
    )
}