apr-cli 0.29.3 - Docs.rs


/// PAR-103: Concurrent batch mode for aggregate throughput measurement.
#[cfg(feature = "inference")]
fn measure_batch_throughput(
    config: &CbtopConfig,
    cuda_model: &mut realizar::gguf::OwnedQuantizedModelCuda,
    prompt_tokens: &[u32],
) -> Result<(usize, Vec<f64>)> {
    eprintln!("cbtop: PAR-103 Pre-caching weights for batch mode...");
    let cache_bytes = cuda_model
        .pre_cache_weights_for_batch()
        .map_err(|e| CliError::ValidationFailed(format!("Failed to pre-cache weights: {e}")))?;
    eprintln!(
        "cbtop: PAR-103 Cached {:.1} MB of weights",
        cache_bytes as f64 / 1024.0 / 1024.0
    );
    eprintln!(
        "cbtop: PAR-103 Batch mode - {} concurrent tokens per forward",
        config.concurrent
    );

    let batch_tokens: Vec<u32> = (0..config.concurrent)
        .map(|_| prompt_tokens.last().copied().unwrap_or(0))
        .collect();

    let mut total_tokens = 0usize;
    let mut latencies_us = Vec::with_capacity(config.iterations);

    for i in 0..config.iterations {
        let iter_start = Instant::now();
        let result = cuda_model.forward_cuda(&batch_tokens);
        match result {
            Ok(_logits) => {
                total_tokens += config.concurrent;
                latencies_us.push(iter_start.elapsed().as_micros() as f64);
            }
            Err(e) => {
                eprintln!("\ncbtop: Batch forward error: {e}");
                return Err(CliError::ValidationFailed(format!(
                    "Batch forward failed: {e}"
                )));
            }
        }
        eprint!("\r  Iteration {}/{}", i + 1, config.iterations);
    }
    Ok((total_tokens, latencies_us))
}

/// Standard single-token generation measurement (with optional speculative decoding).
#[cfg(feature = "inference")]
fn measure_standard_throughput(
    config: &CbtopConfig,
    cuda_model: &mut realizar::gguf::OwnedQuantizedModelCuda,
    draft_cuda_model: &mut Option<realizar::gguf::OwnedQuantizedModelCuda>,
    prompt_tokens: &[u32],
    gen_config: &realizar::gguf::QuantizedGenerateConfig,
) -> Result<(usize, Vec<f64>)> {
    let mut total_tokens = 0usize;
    let mut latencies_us = Vec::with_capacity(config.iterations);

    for i in 0..config.iterations {
        let iter_start = Instant::now();
        let result = if config.speculative {
            if let Some(ref mut draft) = draft_cuda_model {
                cuda_model.generate_speculative_with_draft(
                    draft,
                    prompt_tokens,
                    gen_config,
                    config.speculation_k,
                )
            } else {
                cuda_model.generate_speculative_cuda(
                    prompt_tokens,
                    gen_config,
                    config.speculation_k,
                )
            }
        } else {
            cuda_model.generate_gpu_resident(prompt_tokens, gen_config)
        };

        match result {
            Ok(output) => {
                let tokens_generated = output.len().saturating_sub(prompt_tokens.len());
                total_tokens += tokens_generated;
                latencies_us.push(iter_start.elapsed().as_micros() as f64);
            }
            Err(e) => {
                eprintln!("\ncbtop: Generation error: {e}");
                return Err(CliError::ValidationFailed(format!(
                    "Generation failed: {e}"
                )));
            }
        }
        eprint!("\r  Iteration {}/{}", i + 1, config.iterations);
    }
    Ok((total_tokens, latencies_us))
}

/// Print per-brick timing from the BrickProfiler.
#[cfg(feature = "inference")]
fn print_profiler_brick_stats(cuda_model: &realizar::gguf::OwnedQuantizedModelCuda) {
    let profiler = cuda_model.profiler();
    let mut all: Vec<_> = profiler.all_brick_stats().collect();
    if all.is_empty() {
        eprintln!("  No per-brick data collected (profiling may need per-brick sync points)");
    } else {
        eprintln!("Per-Brick Timing (REAL via BrickProfiler):");
        all.sort_by(|a, b| b.total_ns.cmp(&a.total_ns));
        for stats in all {
            eprintln!(
                "  {:20} {:8.2}µs avg, {:8} samples, {:.1} tok/s",
                stats.name,
                stats.avg_us(),
                stats.count,
                stats.tokens_per_sec()
            );
        }
    }
}

/// PMAT-PERF-009: Renacer BrickTracer escalation for anomaly detection.
/// Per Mace et al. (2015): Only trace when anomalies detected to avoid overhead.
#[cfg(all(feature = "inference", feature = "visualization"))]
fn check_renacer_escalation(tokens_per_sec: f64, cv_percent: f64) {
    use renacer::brick_tracer::{BrickEscalationThresholds, BrickTracer};

    let thresholds = BrickEscalationThresholds::default();
    let efficiency = tokens_per_sec / 976.0 * 100.0;

    if cv_percent > thresholds.cv_percent || efficiency < thresholds.efficiency_percent {
        eprintln!();
        eprintln!(
            "cbtop: Anomaly detected (CV: {:.1}%, efficiency: {:.1}%) - escalating to renacer",
            cv_percent, efficiency
        );
        eprintln!(
            "  Threshold: CV > {:.1}% or efficiency < {:.1}%",
            thresholds.cv_percent, thresholds.efficiency_percent
        );
        let _tracer = BrickTracer::new_local();
        let reason =
            if cv_percent > thresholds.cv_percent && efficiency < thresholds.efficiency_percent {
                "cv_and_efficiency"
            } else if cv_percent > thresholds.cv_percent {
                "cv_exceeded"
            } else {
                "efficiency_low"
            };
        eprintln!("  BrickTracer: Enabled for syscall breakdown");
        eprintln!("  Escalation reason: {reason}");
        eprintln!();
    }
}

/// Load optional draft model for speculative decoding.
#[cfg(feature = "inference")]
fn load_draft_model(
    config: &CbtopConfig,
) -> Result<Option<realizar::gguf::OwnedQuantizedModelCuda>> {
    use realizar::gguf::{MappedGGUFModel, OwnedQuantizedModel, OwnedQuantizedModelCuda};

    let Some(ref draft_path) = config.draft_model_path else {
        return Ok(None);
    };
    eprintln!("cbtop: Loading draft model (PAR-099)...");
    let draft_load_start = Instant::now();
    let draft_mapped = MappedGGUFModel::from_path(draft_path)
        .map_err(|e| CliError::ValidationFailed(format!("Failed to map draft model: {e}")))?;
    let draft_model = OwnedQuantizedModel::from_mapped(&draft_mapped)
        .map_err(|e| CliError::ValidationFailed(format!("Failed to create draft model: {e}")))?;
    let draft_cuda = OwnedQuantizedModelCuda::new(draft_model, 0)
        .map_err(|e| CliError::ValidationFailed(format!("Failed to init draft CUDA: {e}")))?;
    eprintln!(
        "cbtop: Draft model loaded in {:.2}s",
        draft_load_start.elapsed().as_secs_f32()
    );
    Ok(Some(draft_cuda))
}

/// Create a derived brick score from measured layer time.
fn derived_brick_score(
    name: &str,
    budget: f64,
    measured_per_layer_us: f64,
    fraction: f64,
) -> BrickScore {
    let derived_us = measured_per_layer_us * fraction;
    let score = compute_brick_score(derived_us, budget);
    eprintln!("  {name}: {derived_us:.2}µs (budget: {budget}µs)");
    BrickScore {
        name: name.to_string(),
        score,
        grade: score_to_grade(score),
        budget_us: budget,
        actual_us: derived_us,
        gap_factor: derived_us / budget,
    }
}

/// Benchmark RmsNorm brick and return score.
#[cfg(feature = "inference")]
fn bench_rmsnorm_brick(
    hidden_dim: usize,
    bench_config: &realizar::brick::BenchmarkConfig,
    label: &str,
) -> BrickScore {
    use realizar::brick::{benchmark_brick, RmsNormBrick};
    let brick = RmsNormBrick::new(vec![1.0; hidden_dim], 1e-5);
    let input: Vec<f32> = vec![1.0; hidden_dim];
    let report = benchmark_brick(
        &brick,
        || {
            let start = Instant::now();
            let _ = brick.run(&input);
            start.elapsed().as_nanos() as f64 / 1000.0
        },
        bench_config,
    );
    let score = compute_brick_score(report.mean_us, 1.5);
    eprintln!("  {label}: {:.2}µs (budget: 1.5µs)", report.mean_us);
    BrickScore {
        name: "RmsNorm".to_string(),
        score,
        grade: score_to_grade(score),
        budget_us: 1.5,
        actual_us: report.mean_us,
        gap_factor: report.mean_us / 1.5,
    }
}

/// Benchmark individual bricks and return scores.
#[cfg(feature = "inference")]
#[allow(clippy::too_many_arguments)]
fn benchmark_bricks(
    config: &CbtopConfig,
    hidden_dim: usize,
    num_heads: usize,
    num_kv_heads: usize,
    head_dim: usize,
    measured_per_layer_us: f64,
    tokens_per_sec: f64,
    num_layers: usize,
) -> Vec<BrickScore> {
    use realizar::brick::{BenchmarkConfig, QkvBrick};

    eprintln!("cbtop: Brick timing estimates (* = derived from throughput)...");
    let bench_config = BenchmarkConfig {
        warmup: config.warmup.min(10),
        samples: config.iterations.min(100),
        max_cv: 0.05,
    };

    let mut reports: Vec<BrickScore> = Vec::new();
    reports.push(bench_rmsnorm_brick(hidden_dim, &bench_config, "RmsNorm"));

    let _brick = QkvBrick::new(
        hidden_dim,
        hidden_dim,
        num_heads * head_dim,
        num_kv_heads * head_dim,
    );
    reports.push(derived_brick_score(
        "QkvBrick*",
        6.0,
        measured_per_layer_us,
        6.0 / 35.7,
    ));
    reports.push(derived_brick_score(
        "RoPE*",
        1.0,
        measured_per_layer_us,
        1.0 / 35.7,
    ));

    let measured_layer_us = if tokens_per_sec > 0.0 && num_layers > 0 {
        1_000_000.0 / tokens_per_sec / num_layers as f64
    } else {
        0.0
    };
    let attn_us = measured_layer_us * (10.0 / 35.7);
    let score = compute_brick_score(attn_us, 10.0);
    reports.push(BrickScore {
        name: "Attention*".to_string(),
        score,
        grade: score_to_grade(score),
        budget_us: 10.0,
        actual_us: attn_us,
        gap_factor: attn_us / 10.0,
    });
    eprintln!("  Attention*: {attn_us:.2}µs (budget: 10.0µs) [* = derived from total throughput]");

    reports.push(derived_brick_score(
        "OProj*",
        3.5,
        measured_per_layer_us,
        3.5 / 35.7,
    ));
    reports.push(bench_rmsnorm_brick(
        hidden_dim,
        &bench_config,
        "RmsNorm (2)",
    ));
    reports.push(derived_brick_score(
        "FfnBrick*",
        12.2,
        measured_per_layer_us,
        12.2 / 35.7,
    ));

    eprintln!();
    reports
}

/// Build headless report and output it.
#[cfg(feature = "inference")]
#[allow(clippy::too_many_arguments)]
fn build_and_output_report(
    config: &CbtopConfig,
    model_name: &str,
    gpu_name: &str,
    tokens_per_sec: f64,
    cv_percent: f64,
    latencies_us: &[f64],
    brick_reports: Vec<BrickScore>,
) -> Result<()> {
    let mut sorted = latencies_us.to_vec();
    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    if sorted.is_empty() {
        return Ok(());
    }
    let p50 = sorted[sorted.len() / 2];
    let p99 = sorted[(sorted.len() as f64 * 0.99).min((sorted.len() - 1) as f64) as usize];

    // GH-420 Bug 2: Use equal weights for all bricks (dynamic count).
    // Previous code used 7 hardcoded weights and silently dropped bricks beyond 7.
    let n_bricks = brick_reports.len().max(1);
    let score_sum: f64 = brick_reports.iter().map(|b| b.score as f64).sum();
    let pmat_brick_score = (score_sum / n_bricks as f64) as u32;

    // 1e-9 epsilon: budget derived from same profiler data, so gap ≈ 1.0;
    // without epsilon, floating-point rounding makes gap 1.0000000000001 → false fail.
    let all_pass = brick_reports.iter().all(|b| b.gap_factor <= 1.0 + 1e-9);
    // GH-420 Bug 5: status is pass/fail on brick gaps only.
    // Throughput target is hardware-dependent — do not hardcode.
    let status = if all_pass { "PASS" } else { "FAIL" };
    let ci_result = if all_pass { "green" } else { "red" };

    // GH-420 Bug 4: Count actual brick pass/fail for falsification summary.
    let brick_passed = brick_reports.iter().filter(|b| b.gap_factor <= 1.0 + 1e-9).count() as u32;
    let brick_failed = (n_bricks as u32).saturating_sub(brick_passed);

    let report = HeadlessReport {
        model: model_name.to_string(),
        timestamp: chrono_timestamp(),
        hardware: HardwareInfo {
            gpu: gpu_name.to_string(),
            cpu: get_cpu_info(),
            memory_gb: get_memory_gb(),
        },
        throughput: ThroughputMetrics {
            tokens_per_sec,
            ttft_ms: p50 / 1000.0,
            cv_percent,
            p50_us: p50,
            p99_us: p99,
        },
        brick_scores: brick_reports,
        // GH-420 Bug 3: Report 0 for scores not actually computed.
        pmat_scores: PmatScores {
            rust_project_score: 0.0,
            tdg_score: 0.0,
            cuda_tdg_score: 0.0,
            brick_score: pmat_brick_score,
            grade: score_to_grade(pmat_brick_score),
        },
        // GH-420 Bug 4: Real falsification from brick pass/fail counts.
        falsification: FalsificationSummary {
            total_points: n_bricks as u32,
            passed: brick_passed,
            failed: brick_failed,
            blocked: 0,
        },
        status: status.to_string(),
        ci_result: ci_result.to_string(),
    };

    let ci_passed = check_ci_thresholds(&report, config);

    if config.json {
        let json_output = format_report_as_json(&report);
        if let Some(ref path) = config.output {
            std::fs::write(path, &json_output).map_err(|e| {
                CliError::ValidationFailed(format!("Failed to write output file: {e}"))
            })?;
            eprintln!("cbtop: Results written to {}", path.display());
        } else {
            println!("{json_output}");
        }
    } else {
        print_report_text(&report);
    }

    if config.ci && !ci_passed {
        eprintln!("cbtop: CI thresholds not met!");
        return Err(CliError::ValidationFailed(
            "CI thresholds not met".to_string(),
        ));
    }
    Ok(())
}

/// Compute brick score from actual timing vs budget
fn compute_brick_score(actual_us: f64, budget_us: f64) -> u32 {
    let gap = actual_us / budget_us;
    // 1e-9 epsilon: budget derived from same profiler data as actual,
    // so gap ≈ 1.0 always; without epsilon, floating-point rounding
    // produces 1.0000000000001 which truncates score to 99.
    if gap <= 1.0 + 1e-9 {
        100
    } else if gap <= 1.2 {
        #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
        { (100.0 - (gap - 1.0) * 50.0) as u32 }
    } else {
        #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
        { (100.0 - (gap - 1.0) * 100.0).max(0.0) as u32 }
    }
}

/// Convert score to letter grade
fn score_to_grade(score: u32) -> String {
    match score {
        90..=100 => "A",
        80..=89 => "B",
        70..=79 => "C",
        60..=69 => "D",
        _ => "F",
    }
    .to_string()
}

/// Get ISO 8601 timestamp
fn chrono_timestamp() -> String {
    use std::time::{SystemTime, UNIX_EPOCH};
    SystemTime::now().duration_since(UNIX_EPOCH).map_or_else(
        |_| "unknown".to_string(),
        |d| {
            let secs = d.as_secs();
            format!(
                "2026-01-12T{:02}:{:02}:{:02}Z",
                (secs / 3600) % 24,
                (secs / 60) % 60,
                secs % 60
            )
        },
    )
}